[Teamcoco] Move into own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.flickr import FlickrIE
33 from .extractor.funnyordie import FunnyOrDieIE
34 from .extractor.gametrailers import GametrailersIE
35 from .extractor.generic import GenericIE
36 from .extractor.googleplus import GooglePlusIE
37 from .extractor.googlesearch import GoogleSearchIE
38 from .extractor.howcast import HowcastIE
39 from .extractor.hypem import HypemIE
40 from .extractor.ina import InaIE
41 from .extractor.infoq import InfoQIE
42 from .extractor.justintv import JustinTVIE
43 from .extractor.keek import KeekIE
44 from .extractor.liveleak import LiveLeakIE
45 from .extractor.metacafe import MetacafeIE
46 from .extractor.mixcloud import MixcloudIE
47 from .extractor.mtv import MTVIE
48 from .extractor.myspass import MySpassIE
49 from .extractor.myvideo import MyVideoIE
50 from .extractor.nba import NBAIE
51 from .extractor.statigram import StatigramIE
52 from .extractor.photobucket import PhotobucketIE
53 from .extractor.pornotube import PornotubeIE
54 from .extractor.rbmaradio import RBMARadioIE
55 from .extractor.redtube import RedTubeIE
56 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
57 from .extractor.spiegel import SpiegelIE
58 from .extractor.stanfordoc import StanfordOpenClassroomIE
59 from .extractor.steam import SteamIE
60 from .extractor.teamcoco import TeamcocoIE
61 from .extractor.ted import TEDIE
62 from .extractor.tumblr import TumblrIE
63 from .extractor.ustream import UstreamIE
64 from .extractor.vbox7 import Vbox7IE
65 from .extractor.vimeo import VimeoIE
66 from .extractor.vine import VineIE
67 from .extractor.worldstarhiphop import WorldStarHipHopIE
68 from .extractor.xnxx import XNXXIE
69 from .extractor.xvideos import XVideosIE
70 from .extractor.yahoo import YahooIE, YahooSearchIE
71 from .extractor.youjizz import YouJizzIE
72 from .extractor.youku import YoukuIE
73 from .extractor.youporn import YouPornIE
74 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
75 from .extractor.zdf import ZDFIE
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so only a literal "www." prefix matches.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the metadata of a single xHamster video.

        Downloads the movie page for the id found in *url* and scrapes the
        media URL, title, upload date, uploader id and thumbnail from it.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if *url* does not match _VALID_URL or if the
        media URL cannot be located in the downloaded page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with a clear extractor error instead of an AttributeError
            # on the mobj.group() call below.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Only the numeric id is used to build the page address.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = mobj.group('server')
        media_file = mobj.group('file')
        if not server:
            # With an empty server, 'file' is itself the URL-quoted media URL.
            video_url = compat_urllib_parse.unquote(media_file)
        else:
            # Otherwise 'file' is appended to the server as a key= component.
            video_url = server + '/key=' + media_file
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            # A missing upload date is not fatal: warn and carry on.
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
167
168
169
170
171
def gen_extractors():
    """Build one fresh instance of each supported extractor.

    The position in the returned list is significant: the first extractor
    that matches a URL is the one that handles it.
    """
    # Extractor classes in matching priority order; GenericIE stays last.
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in ordered_classes]
241
def get_info_extractor(ie_name):
    """Look up an info extractor class in this module by its short name.

    'IE' is appended to ie_name and the result is used as a key into this
    module's global namespace; an unknown name raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]