1b7e5649d4dbfd386aa17f3b4d2f59c0df808b85
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mixcloud import MixcloudIE
37 from .extractor.mtv import MTVIE
38 from .extractor.myvideo import MyVideoIE
39 from .extractor.nba import NBAIE
40 from .extractor.statigram import StatigramIE
41 from .extractor.photobucket import PhotobucketIE
42 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
43 from .extractor.stanfordoc import StanfordOpenClassroomIE
44 from .extractor.ted import TEDIE
45 from .extractor.vimeo import VimeoIE
46 from .extractor.xvideos import XVideosIE
47 from .extractor.yahoo import YahooIE, YahooSearchIE
48 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
49 from .extractor.zdf import ZDFIE
50
51
52
53
54
55
56
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as several FLV/MP4 segments; this extractor
    returns one info dict per segment.  Segment URLs require a session id
    and a descrambled file id derived from a server-provided seed.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: current time in ms followed by two random blocks,
        # mimicking what the Flash player generates.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the alphabet below using a linear
        # congruential generator seeded by the server's 'seed' value.
        # The exact constants (211, 30031, 65536) must match the player's
        # algorithm, or the decoded file id will be wrong.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the shuffled
        # alphabet; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's requested format onto Youku's stream names:
            # 'hd2' when available for best, 'mp4' for worst, 'flv' otherwise.
            # NOTE(review): any other explicit format request also falls
            # through to 'flv' here.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the zero-based segment number (2 hex digits) into
            # positions 8-9 of the decoded file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
149
150
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # Fetch the page once and pull every field out of it.
        page = self._download_webpage(url, video_id)

        # The FLV URL is percent-encoded inside the player variables.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, page, u'video URL'))

        title = self._html_search_regex(self.VIDEO_TITLE_RE, page, u'title')
        thumb = self._search_regex(self.VIDEO_THUMB_RE, page, u'thumbnail',
                                   fatal=False)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }]
189
190
191
192
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a bare channel page, /b/<videoid> for a single
    # broadcast, and /c/<chapterid> for a chapter of a broadcast.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (raw_item_count, infos); the raw count (including clips
        without a video_file_url) drives the pagination loop in
        _real_extract.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # A successful response is a JSON list; errors come back as an
        # object with an 'error' key.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with YYYY-MM-DD; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel archives are paginated; fetched below in the loop.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter URLs are resolved here in full and returned early.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: 'a' keeps the matching <archive> element; the else
            # branch fires only when no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API until a short page signals the end
        # (single broadcasts fetch exactly one page).
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
325
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)
        video_id = match.group('id')

        page = self._download_webpage(url, video_id)

        # The downloadable file is the second <source> inside the <video> tag.
        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the page <title>.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
354
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the pattern must be
        # matched with re.VERBOSE explicitly.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')

        page_url = self._VIDEO_PAGE_TEMPLATE % game_id
        page = self._download_webpage(page_url, game_id)

        # Steam may interpose an age gate; answer it with a fixed birth
        # date and fetch the video page again.
        if re.search('<h2>Please enter your birth date to continue:</h2>', page) is not None:
            page_url = self._AGECHECK_TEMPLATE % game_id
            self.report_age_confirmation()
            page = self._download_webpage(page_url, game_id)

        self.report_extraction(game_id)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             page, 'game title')

        # Movie URLs, display names and thumbnails appear in three parallel
        # sequences on the page; walk them in lockstep.
        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        videos = []
        for movie, name, thumb in zip(re.finditer(movie_re, page),
                                      re.finditer(title_re, page),
                                      re.finditer(thumb_re, page)):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
409
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Recorded videos live on a fixed CDN path keyed by the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            page, u'title')

        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            page, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
441
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            page, u'video URL')

        # Guess the container from the stream URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title = self._html_search_regex(r"<title>(.*)</title>",
            page, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            page, u'thumbnail', fatal=False)
        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
481
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)

        # The show metadata is assigned to window.gon in inline JS.
        raw_json = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            page, u'json data', flags=re.MULTILINE)
        try:
            data = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream; extension comes from the URL path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
515
516
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats to the console."""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age check; this cookie bypasses it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters embedded in the player setup code
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # was a bare "except:"; narrowed to what json.loads raises
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # %s formatting: the exception object cannot be concatenated
            # to a str (the old "'...' + sys.exc_info()[1]" raised TypeError)
            raise ExtractorError(u'Missing JSON parameter: %s' % sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the download links from the list
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20....mp4?nvb=...&hash=...
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20....mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path component 4 encodes "<size>_<bitrate>_<id>"; keep the
            # first two pieces as the format label, e.g. "480p-370k".
            format = '-'.join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # The page lists formats best-first.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: the original checked the undefined name "result",
            # raising NameError whenever a specific format was requested.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
621
622
623
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('videoid')
        title = match.group('title')

        page = self._download_webpage(url, video_id)

        # Pull the FLV stream URL out of the player setup code.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            page, u'video url'))

        # The upload date is rendered as e.g. "Added 12/31/2011 by".
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            page, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': title,
                 'ext': 'flv',
                 'format': 'flv'}]
658
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('videoid')

        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            page, u'title').strip()

        # The actual player lives on a separate embed page; locate it first.
        embed = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed.group(0).strip()
        video_id = embed.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
699
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group('id')

        page = self._download_webpage(url, playlist_id)

        # Mix metadata is assigned to PAGE.mix in inline JS.
        mix_json = self._search_regex(r"PAGE.mix = (.*?);\n", page,
                                      u'trax information', flags=re.DOTALL)
        mix = json.loads(mix_json)

        # Playback works like a radio session: open a session, then ask
        # the API for one track after another until the last one.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track = api_data[u'set']['track']
            entries.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return entries
740
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs follow a fixed CDN scheme.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')

        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
768
769
class MySpassIE(InfoExtractor):
    """Information Extractor for myspass.de videos (via their metadata XML)."""
    # Dots escaped so the pattern only matches the real myspass.de host.
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # download and parse the metadata XML document for this id
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # url_flv and title are mandatory; everything else is optional
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: the previous code referenced the undefined name 'ext'
            # here, raising NameError whenever <format_id> was missing.
            video_format = extension
        else:
            video_format = format_id_el.text
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description,
        }]
823
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract a Spiegel video through its flash XML descriptor."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # The XML descriptor lists one entry per available variant.
        xml_url = u'http://video2.spiegel.de/flash/%s.xml' % video_id
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # The last entry of the document is selected.
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
855
class LiveLeakIE(InfoExtractor):

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the direct file URL and metadata from a LiveLeak view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site prefix which is stripped off.
        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()
        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'uploader': uploader,
        }]
892
893
894
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the embedded video from a Tumblr post."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(post_url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail.
        thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumbnail,
                 'ext': ext,
                 }]
928
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp track downloads."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from 'id' so the builtin of the same name is not shadowed.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist'],
                      }

        return [track_info]
974
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The page embeds a single mp4 <source> element.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
1002         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # The MRSS feed carries both the stream URL and the title.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
1029
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for this id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
1063
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        # The twitter player card metadata carries the direct stream URL.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
1097
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/%s/%s' % (video_uploader_id, video_id)
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret authorizes the two XML requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=%s&secret=%s&bitrate=700&target=_self' % (video_id, secret)
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=%s&tech=flash&mode=playlist&bitrate=700&secret=%s&rd=video.yahoo.com&noad=1' % (node_id, secret)
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
1146
class TeamcocoIE(InfoExtractor):
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract a Team Coco video via its CVP XML data feed."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives on the <article> element of the page.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
1185
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        # An empty 'srv' means 'file' already holds a full (escaped) URL.
        if mobj.group('server'):
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        else:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The description is not visible anywhere in the page UI.

        date_mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if date_mobj is None:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')
        else:
            video_upload_date = ''.join(date_mobj.group('upload_date_Y', 'upload_date_m', 'upload_date_d'))

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')
        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
1237
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Fetch the page with a timestamped query; the Set-Cookie header of
        # the response is needed for the serve request below.
        query = compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()})
        request = compat_urllib_request.Request(url + "?" + query)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id':       track_id,
            'url':      song_data[u"url"],
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1287
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page only holds a javascript redirect to the real page.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST the id to the info endpoint; the answer is querystring-encoded.
        form_data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request("http://vbox7.com/play/magare.do", form_data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1323
1324
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        # The more specific YouTube extractors (playlist, channel, user,
        # search) must precede YoutubeIE, since the first match wins.
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        BlipTVUserIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        # The set extractor comes before the single-track one for the same
        # first-match reason as the YouTube group above.
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        ZDFIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        HowcastIE(),
        VineIE(),
        FlickrIE(),
        TeamcocoIE(),
        XHamsterIE(),
        HypemIE(),
        Vbox7IE(),
        GametrailersIE(),
        StatigramIE(),
        # Kept last: the generic fallback only runs when nothing above matched.
        GenericIE()
    ]
1394
def get_info_extractor(ie_name):
    """Return the extractor class for *ie_name* (its class name minus 'IE').

    Looks the class up in this module's namespace; a KeyError propagates
    when no extractor with that name exists.
    """
    return globals()['%sIE' % ie_name]