Move DepositFiles into its own IE
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.depositfiles import DepositFilesIE
27 from .extractor.facebook import FacebookIE
28 from .extractor.gametrailers import GametrailersIE
29 from .extractor.generic import GenericIE
30 from .extractor.googleplus import GooglePlusIE
31 from .extractor.googlesearch import GoogleSearchIE
32 from .extractor.metacafe import MetacafeIE
33 from .extractor.myvideo import MyVideoIE
34 from .extractor.statigram import StatigramIE
35 from .extractor.photobucket import PhotobucketIE
36 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
37 from .extractor.vimeo import VimeoIE
38 from .extractor.yahoo import YahooIE, YahooSearchIE
39 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
40 from .extractor.zdf import ZDFIE
41
42
69 class EscapistIE(InfoExtractor):
70     """Information extractor for The Escapist """
71
72     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
73     IE_NAME = u'escapist'
74
75     def _real_extract(self, url):
76         mobj = re.match(self._VALID_URL, url)
77         if mobj is None:
78             raise ExtractorError(u'Invalid URL: %s' % url)
79         showName = mobj.group('showname')
80         videoId = mobj.group('episode')
81
82         self.report_extraction(videoId)
83         webpage = self._download_webpage(url, videoId)
84
85         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
86             webpage, u'description', fatal=False)
87
88         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
89             webpage, u'thumbnail', fatal=False)
90
91         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
92             webpage, u'player url')
93
94         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
95             webpage, u'title').split(' : ')[-1]
96
97         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
98         configUrl = compat_urllib_parse.unquote(configUrl)
99
100         configJSON = self._download_webpage(configUrl, videoId,
101                                             u'Downloading configuration',
102                                             u'unable to download configuration')
103
104         # Technically, it's JavaScript, not JSON
105         configJSON = configJSON.replace("'", '"')
106
107         try:
108             config = json.loads(configJSON)
109         except (ValueError,) as err:
110             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
111
112         playlist = config['playlist']
113         videoUrl = playlist[1]['url']
114
115         info = {
116             'id': videoId,
117             'url': videoUrl,
118             'uploader': showName,
119             'upload_date': None,
120             'title': title,
121             'ext': 'mp4',
122             'thumbnail': imgUrl,
123             'description': videoDesc,
124             'player_url': playerUrl,
125         }
126
127         return [info]
128
129 class CollegeHumorIE(InfoExtractor):
130     """Information extractor for collegehumor.com"""
131
132     _WORKING = False
133     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
134     IE_NAME = u'collegehumor'
135
136     def report_manifest(self, video_id):
137         """Report information extraction."""
138         self.to_screen(u'%s: Downloading XML manifest' % video_id)
139
140     def _real_extract(self, url):
141         mobj = re.match(self._VALID_URL, url)
142         if mobj is None:
143             raise ExtractorError(u'Invalid URL: %s' % url)
144         video_id = mobj.group('videoid')
145
146         info = {
147             'id': video_id,
148             'uploader': None,
149             'upload_date': None,
150         }
151
152         self.report_extraction(video_id)
153         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
154         try:
155             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
156         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
157             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
158
159         mdoc = xml.etree.ElementTree.fromstring(metaXml)
160         try:
161             videoNode = mdoc.findall('./video')[0]
162             info['description'] = videoNode.findall('./description')[0].text
163             info['title'] = videoNode.findall('./caption')[0].text
164             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
165             manifest_url = videoNode.findall('./file')[0].text
166         except IndexError:
167             raise ExtractorError(u'Invalid metadata XML file')
168
169         manifest_url += '?hdcore=2.10.3'
170         self.report_manifest(video_id)
171         try:
172             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
173         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
174             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
175
176         adoc = xml.etree.ElementTree.fromstring(manifestXml)
177         try:
178             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
179             node_id = media_node.attrib['url']
180             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
181         except IndexError as err:
182             raise ExtractorError(u'Invalid manifest file')
183
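        # Build the HDS (f4f) fragment URL from the manifest host, the media
        # node's 'url' attribute and the manifest id parsed above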
184         url_pr = compat_urllib_parse_urlparse(manifest_url)
185         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
186
187         info['url'] = url
188         info['ext'] = 'f4f'
189         return [info]
190
191
192 class XVideosIE(InfoExtractor):
193     """Information extractor for xvideos.com"""
194
195     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
196     IE_NAME = u'xvideos'
197
198     def _real_extract(self, url):
199         mobj = re.match(self._VALID_URL, url)
200         if mobj is None:
201             raise ExtractorError(u'Invalid URL: %s' % url)
202         video_id = mobj.group(1)
203
204         webpage = self._download_webpage(url, video_id)
205
206         self.report_extraction(video_id)
207
208         # Extract video URL
209         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
210             webpage, u'video URL'))
211
212         # Extract title
213         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
214             webpage, u'title')
215
216         # Extract video thumbnail
217         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
218             webpage, u'thumbnail', fatal=False)
219
220         info = {
221             'id': video_id,
222             'url': video_url,
223             'uploader': None,
224             'upload_date': None,
225             'title': video_title,
226             'ext': 'flv',
227             'thumbnail': video_thumbnail,
228             'description': None,
229         }
230
231         return [info]
232
233
234
235
236 class InfoQIE(InfoExtractor):
237     """Information extractor for infoq.com"""
238     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
239
240     def _real_extract(self, url):
241         mobj = re.match(self._VALID_URL, url)
242         if mobj is None:
243             raise ExtractorError(u'Invalid URL: %s' % url)
244
245         webpage = self._download_webpage(url, video_id=url)
246         self.report_extraction(url)
247
248         # Extract video URL
249         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
250         if mobj is None:
251             raise ExtractorError(u'Unable to extract video url')
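        # The page embeds the RTMP stream path base64-encoded in 'jsclassref'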
252         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
253         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
254
255         # Extract title
256         video_title = self._search_regex(r'contentTitle = "(.*?)";',
257             webpage, u'title')
258
259         # Extract description
260         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
261             webpage, u'description', fatal=False)
262
263         video_filename = video_url.split('/')[-1]
264         video_id, extension = video_filename.split('.')
265
266         info = {
267             'id': video_id,
268             'url': video_url,
269             'uploader': None,
270             'upload_date': None,
271             'title': video_title,
272             'ext': extension, # Extension is always(?) mp4, but seems to be flv
273             'thumbnail': None,
274             'description': video_description,
275         }
276
277         return [info]
278
279 class MixcloudIE(InfoExtractor):
280     """Information extractor for www.mixcloud.com"""
281
282     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
283     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
284     IE_NAME = u'mixcloud'
285
286     def report_download_json(self, file_id):
287         """Report JSON download."""
288         self.to_screen(u'Downloading json')
289
290     def get_urls(self, jsonData, fmt, bitrate='best'):
291         """Get urls from 'audio_formats' section in json"""
292         file_url = None
293         try:
294             bitrate_list = jsonData[fmt]
295             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
296                 bitrate = max(bitrate_list) # select highest
297
298             url_list = jsonData[fmt][bitrate]
299         except TypeError: # we have no bitrate info.
300             url_list = jsonData[fmt]
301         return url_list
302
303     def check_urls(self, url_list):
304         """Returns 1st active url from list"""
305         for url in url_list:
306             try:
307                 compat_urllib_request.urlopen(url)
308                 return url
309             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
310                 url = None
311
312         return None
313
314     def _print_formats(self, formats):
315         print('Available formats:')
316         for fmt in formats.keys():
317             for b in formats[fmt]:
318                 try:
319                     ext = formats[fmt][b][0]
320                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
321                 except TypeError: # we have no bitrate info
322                     ext = formats[fmt][0]
323                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
324                     break
325
326     def _real_extract(self, url):
327         mobj = re.match(self._VALID_URL, url)
328         if mobj is None:
329             raise ExtractorError(u'Invalid URL: %s' % url)
330         # extract uploader & filename from url
331         uploader = mobj.group(1).decode('utf-8')
332         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
333
334         # construct API request
335         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
336         # retrieve .json file with links to files
337         request = compat_urllib_request.Request(file_url)
338         try:
339             self.report_download_json(file_url)
340             jsonData = compat_urllib_request.urlopen(request).read()
341         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
342             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
343
344         # parse JSON
345         json_data = json.loads(jsonData)
346         player_url = json_data['player_swf_url']
347         formats = dict(json_data['audio_formats'])
348
349         req_format = self._downloader.params.get('format', None)
350         bitrate = None
351
352         if self._downloader.params.get('listformats', None):
353             self._print_formats(formats)
354             return
355
356         if req_format is None or req_format == 'best':
357             for format_param in formats.keys():
358                 url_list = self.get_urls(formats, format_param)
359                 # check urls
360                 file_url = self.check_urls(url_list)
361                 if file_url is not None:
362                     break # got it!
363         else:
364             if req_format not in formats:
365                 raise ExtractorError(u'Format is not available')
366
367             url_list = self.get_urls(formats, req_format)
368             file_url = self.check_urls(url_list)
369             format_param = req_format
370
371         return [{
372             'id': file_id.decode('utf-8'),
373             'url': file_url.decode('utf-8'),
374             'uploader': uploader.decode('utf-8'),
375             'upload_date': None,
376             'title': json_data['name'],
377             'ext': file_url.split('.')[-1].decode('utf-8'),
378             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
379             'thumbnail': json_data['thumbnail_url'],
380             'description': json_data['description'],
381             'player_url': player_url.decode('utf-8'),
382         }]
383
384 class StanfordOpenClassroomIE(InfoExtractor):
385     """Information extractor for Stanford's Open ClassRoom"""
386
387     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
388     IE_NAME = u'stanfordoc'
389
390     def _real_extract(self, url):
391         mobj = re.match(self._VALID_URL, url)
392         if mobj is None:
393             raise ExtractorError(u'Invalid URL: %s' % url)
394
395         if mobj.group('course') and mobj.group('video'): # A specific video
396             course = mobj.group('course')
397             video = mobj.group('video')
398             info = {
399                 'id': course + '_' + video,
400                 'uploader': None,
401                 'upload_date': None,
402             }
403
404             self.report_extraction(info['id'])
405             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
406             xmlUrl = baseUrl + video + '.xml'
407             try:
408                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
409             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
410                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
411             mdoc = xml.etree.ElementTree.fromstring(metaXml)
412             try:
413                 info['title'] = mdoc.findall('./title')[0].text
414                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
415             except IndexError:
416                 raise ExtractorError(u'Invalid metadata XML file')
417             info['ext'] = info['url'].rpartition('.')[2]
418             return [info]
419         elif mobj.group('course'): # A course page
420             course = mobj.group('course')
421             info = {
422                 'id': course,
423                 'type': 'playlist',
424                 'uploader': None,
425                 'upload_date': None,
426             }
427
428             coursepage = self._download_webpage(url, info['id'],
429                                         note='Downloading course info page',
430                                         errnote='Unable to download course info page')
431
432             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
433
434             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
435                 coursepage, u'description', fatal=False)
436
437             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
438             info['list'] = [
439                 {
440                     'type': 'reference',
441                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
442                 }
443                     for vpage in links]
444             results = []
445             for entry in info['list']:
446                 assert entry['type'] == 'reference'
447                 results += self.extract(entry['url'])
448             return results
449         else: # Root page
450             info = {
451                 'id': 'Stanford OpenClassroom',
452                 'type': 'playlist',
453                 'uploader': None,
454                 'upload_date': None,
455             }
456
457             self.report_download_webpage(info['id'])
458             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
459             try:
460                 rootpage = compat_urllib_request.urlopen(rootURL).read()
461             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
463
464             info['title'] = info['id']
465
466             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
467             info['list'] = [
468                 {
469                     'type': 'reference',
470                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
471                 }
472                     for cpage in links]
473
474             results = []
475             for entry in info['list']:
476                 assert entry['type'] == 'reference'
477                 results += self.extract(entry['url'])
478             return results
479
480 class MTVIE(InfoExtractor):
481     """Information extractor for MTV.com"""
482
483     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
484     IE_NAME = u'mtv'
485
486     def _real_extract(self, url):
487         mobj = re.match(self._VALID_URL, url)
488         if mobj is None:
489             raise ExtractorError(u'Invalid URL: %s' % url)
490         if not mobj.group('proto'):
491             url = 'http://' + url
492         video_id = mobj.group('videoid')
493
494         webpage = self._download_webpage(url, video_id)
495
496         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
497             webpage, u'song name', fatal=False)
498
499         performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
500             webpage, u'performer')
501         video_title = performer + ' - ' + song_name if song_name else performer

502         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
503             webpage, u'mtvn_uri', fatal=False)
504
505         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
506             webpage, u'content id', fatal=False)
507
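        # mediaGen returns an XML document describing the available renditions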
508         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
509         self.report_extraction(video_id)
510         request = compat_urllib_request.Request(videogen_url)
511         try:
512             metadataXml = compat_urllib_request.urlopen(request).read()
513         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
514             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
515
516         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
517         renditions = mdoc.findall('.//rendition')
518
519         # For now, always pick the highest quality.
520         rendition = renditions[-1]
521
522         try:
523             _,_,ext = rendition.attrib['type'].partition('/')
524             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
525             video_url = rendition.find('./src').text
526         except KeyError:
527             raise ExtractorError('Invalid rendition field.')
528
529         info = {
530             'id': video_id,
531             'url': video_url,
532             'uploader': performer,
533             'upload_date': None,
534             'title': video_title,
535             'ext': ext,
536             'format': format,
537         }
538
539         return [info]
540
541
542 class YoukuIE(InfoExtractor):
543     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
544
545     def _gen_sid(self):
546         nowTime = int(time.time() * 1000)
547         random1 = random.randint(1000,1998)
548         random2 = random.randint(1000,9999)
549
550         return "%d%d%d" %(nowTime,random1,random2)
551
552     def _get_file_ID_mix_string(self, seed):
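        # Build a seed-dependent permutation of the alphabet: a linear-congruential
        # update of the seed selects one character per iteration, without replacement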
553         mixed = []
554         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
555         seed = float(seed)
556         for i in range(len(source)):
557             seed  =  (seed * 211 + 30031 ) % 65536
558             index  =  math.floor(seed / 65536 * len(source) )
559             mixed.append(source[int(index)])
560             source.remove(source[int(index)])
561         #return ''.join(mixed)
562         return mixed
563
564     def _get_file_id(self, fileId, seed):
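        # fileId is a '*'-separated list of indices into the permuted alphabet above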
565         mixed = self._get_file_ID_mix_string(seed)
566         ids = fileId.split('*')
567         realId = []
568         for ch in ids:
569             if ch:
570                 realId.append(mixed[int(ch)])
571         return ''.join(realId)
572
573     def _real_extract(self, url):
574         mobj = re.match(self._VALID_URL, url)
575         if mobj is None:
576             raise ExtractorError(u'Invalid URL: %s' % url)
577         video_id = mobj.group('ID')
578
579         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
580
581         jsondata = self._download_webpage(info_url, video_id)
582
583         self.report_extraction(video_id)
584         try:
585             config = json.loads(jsondata)
586
587             video_title =  config['data'][0]['title']
588             seed = config['data'][0]['seed']
589
590             format = self._downloader.params.get('format', None)
591             supported_format = list(config['data'][0]['streamfileids'].keys())
592
593             if format is None or format == 'best':
594                 if 'hd2' in supported_format:
595                     format = 'hd2'
596                 else:
597                     format = 'flv'
598                 ext = u'flv'
599             elif format == 'worst':
600                 format = 'mp4'
601                 ext = u'mp4'
602             else:
603                 format = 'flv'
604                 ext = u'flv'
605
606
607             fileid = config['data'][0]['streamfileids'][format]
608             keys = [s['k'] for s in config['data'][0]['segs'][format]]
609         except (UnicodeDecodeError, ValueError, KeyError):
610             raise ExtractorError(u'Unable to extract info section')
611
612         files_info=[]
613         sid = self._gen_sid()
614         fileid = self._get_file_id(fileid, seed)
615
616         #column 8,9 of fileid represent the segment number
617         #fileid[7:9] should be changed
618         for index, key in enumerate(keys):
619
620             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
621             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
622
623             info = {
624                 'id': '%s_part%02d' % (video_id, index),
625                 'url': download_url,
626                 'uploader': None,
627                 'upload_date': None,
628                 'title': video_title,
629                 'ext': ext,
630             }
631             files_info.append(info)
632
633         return files_info
634
635
636 class XNXXIE(InfoExtractor):
637     """Information extractor for xnxx.com"""
638
639     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
640     IE_NAME = u'xnxx'
641     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
642     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
643     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
644
645     def _real_extract(self, url):
646         mobj = re.match(self._VALID_URL, url)
647         if mobj is None:
648             raise ExtractorError(u'Invalid URL: %s' % url)
649         video_id = mobj.group(1)
650
651         # Get webpage content
652         webpage = self._download_webpage(url, video_id)
653
654         video_url = self._search_regex(self.VIDEO_URL_RE,
655             webpage, u'video URL')
656         video_url = compat_urllib_parse.unquote(video_url)
657
658         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
659             webpage, u'title')
660
661         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
662             webpage, u'thumbnail', fatal=False)
663
664         return [{
665             'id': video_id,
666             'url': video_url,
667             'uploader': None,
668             'upload_date': None,
669             'title': video_title,
670             'ext': 'flv',
671             'thumbnail': video_thumbnail,
672             'description': None,
673         }]
674
675
676
677 class NBAIE(InfoExtractor):
678     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
679     IE_NAME = u'nba'
680
681     def _real_extract(self, url):
682         mobj = re.match(self._VALID_URL, url)
683         if mobj is None:
684             raise ExtractorError(u'Invalid URL: %s' % url)
685
686         video_id = mobj.group(1)
687
688         webpage = self._download_webpage(url, video_id)
689
690         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
691
692         shortened_video_id = video_id.rpartition('/')[2]
693         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
694             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
695
696         # It isn't there in the HTML it returns to us
697         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
698
699         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
700
701         info = {
702             'id': shortened_video_id,
703             'url': video_url,
704             'ext': 'mp4',
705             'title': title,
706             # 'uploader_date': uploader_date,
707             'description': description,
708         }
709         return [info]
710
711 class JustinTVIE(InfoExtractor):
712     """Information extractor for justin.tv and twitch.tv"""
713     # TODO: One broadcast may be split into multiple videos. The key
714     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
715     # starts at 1 and increases. Can we treat all parts as one video?
716
717     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
718         (?:
719             (?P<channelid>[^/]+)|
720             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
721             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
722         )
723         /?(?:\#.*)?$
724         """
725     _JUSTIN_PAGE_LIMIT = 100
726     IE_NAME = u'justin.tv'
727
728     def report_download_page(self, channel, offset):
729         """Report attempt to download a single page of videos."""
730         self.to_screen(u'%s: Downloading video information from %d to %d' %
731                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
732
733     # Return count of items, list of *valid* items
734     def _parse_page(self, url, video_id):
735         webpage = self._download_webpage(url, video_id,
736                                          u'Downloading video info JSON',
737                                          u'unable to download video info JSON')
738
739         response = json.loads(webpage)
740         if type(response) != list:
741             error_text = response.get('error', 'unknown error')
742             raise ExtractorError(u'Justin.tv API: %s' % error_text)
743         info = []
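        # Entries without a video_file_url are skipped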
744         for clip in response:
745             video_url = clip['video_file_url']
746             if video_url:
747                 video_extension = os.path.splitext(video_url)[1][1:]
748                 video_date = re.sub('-', '', clip['start_time'][:10])
749                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
750                 video_id = clip['id']
751                 video_title = clip.get('title', video_id)
752                 info.append({
753                     'id': video_id,
754                     'url': video_url,
755                     'title': video_title,
756                     'uploader': clip.get('channel_name', video_uploader_id),
757                     'uploader_id': video_uploader_id,
758                     'upload_date': video_date,
759                     'ext': video_extension,
760                 })
761         return (len(response), info)
762
763     def _real_extract(self, url):
764         mobj = re.match(self._VALID_URL, url)
765         if mobj is None:
766             raise ExtractorError(u'invalid URL: %s' % url)
767
768         api_base = 'http://api.justin.tv'
769         paged = False
770         if mobj.group('channelid'):
771             paged = True
772             video_id = mobj.group('channelid')
773             api = api_base + '/channel/archives/%s.json' % video_id
774         elif mobj.group('chapterid'):
775             chapter_id = mobj.group('chapterid')
776
777             webpage = self._download_webpage(url, chapter_id)
778             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
779             if not m:
780                 raise ExtractorError(u'Cannot find archive of a chapter')
781             archive_id = m.group(1)
782
783             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
784             chapter_info_xml = self._download_webpage(api, chapter_id,
785                                              note=u'Downloading chapter information',
786                                              errnote=u'Chapter information download failed')
787             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
788             for a in doc.findall('.//archive'):
789                 if archive_id == a.find('./id').text:
790                     break
791             else:
792                 raise ExtractorError(u'Could not find chapter in chapter information')
793
794             video_url = a.find('./video_file_url').text
795             video_ext = video_url.rpartition('.')[2] or u'flv'
796
797             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
798             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
799                                    note='Downloading chapter metadata',
800                                    errnote='Download of chapter metadata failed')
801             chapter_info = json.loads(chapter_info_json)
802
803             bracket_start = int(doc.find('.//bracket_start').text)
804             bracket_end = int(doc.find('.//bracket_end').text)
805
806             # TODO determine start (and probably fix up file)
807             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
808             #video_url += u'?start=' + TODO:start_timestamp
809             # bracket_start is 13290, but we want 51670615
810             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
811                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
812
813             info = {
814                 'id': u'c' + chapter_id,
815                 'url': video_url,
816                 'ext': video_ext,
817                 'title': chapter_info['title'],
818                 'thumbnail': chapter_info['preview'],
819                 'description': chapter_info['description'],
820                 'uploader': chapter_info['channel']['display_name'],
821                 'uploader_id': chapter_info['channel']['name'],
822             }
823             return [info]
824         else:
825             video_id = mobj.group('videoid')
826             api = api_base + '/broadcast/by_archive/%s.json' % video_id
827
828         self.report_extraction(video_id)
829
830         info = []
831         offset = 0
832         limit = self._JUSTIN_PAGE_LIMIT
833         while True:
834             if paged:
835                 self.report_download_page(video_id, offset)
836             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
837             page_count, page_info = self._parse_page(page_url, video_id)
838             info.extend(page_info)
839             if not paged or page_count != limit:
840                 break
841             offset += limit
842         return info
843
844 class FunnyOrDieIE(InfoExtractor):
845     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
846
847     def _real_extract(self, url):
848         mobj = re.match(self._VALID_URL, url)
849         if mobj is None:
850             raise ExtractorError(u'invalid URL: %s' % url)
851
852         video_id = mobj.group('id')
853         webpage = self._download_webpage(url, video_id)
854
855         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
856             webpage, u'video URL', flags=re.DOTALL)
857
858         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
859             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
860
861         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
862             webpage, u'description', fatal=False, flags=re.DOTALL)
863
864         info = {
865             'id': video_id,
866             'url': video_url,
867             'ext': 'mp4',
868             'title': title,
869             'description': video_description,
870         }
871         return [info]
872
873 class SteamIE(InfoExtractor):
874     _VALID_URL = r"""http://store\.steampowered\.com/
875                 (agecheck/)?
876                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
877                 (?P<gameID>\d+)/?
878                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
879                 """
880     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
881     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
882
883     @classmethod
884     def suitable(cls, url):
885         """Receives a URL and returns True if suitable for this IE."""
886         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
887
888     def _real_extract(self, url):
889         m = re.match(self._VALID_URL, url, re.VERBOSE)
890         gameID = m.group('gameID')
891
892         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
893         webpage = self._download_webpage(videourl, gameID)
894
895         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
896             videourl = self._AGECHECK_TEMPLATE % gameID
897             self.report_age_confirmation()
898             webpage = self._download_webpage(videourl, gameID)
899
900         self.report_extraction(gameID)
901         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
902                                              webpage, 'game title')
903
904         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
905         mweb = re.finditer(urlRE, webpage)
906         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
907         titles = re.finditer(namesRE, webpage)
908         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
909         thumbs = re.finditer(thumbsRE, webpage)
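        # The three iterators are assumed to yield matches in the same page order,
        # so zip() pairs every movie with its title and thumbnail positionally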
910         videos = []
911         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
912             video_id = vid.group('videoID')
913             title = vtitle.group('videoName')
914             video_url = vid.group('videoURL')
915             video_thumb = thumb.group('thumbnail')
916             if not video_url:
917                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
918             info = {
919                 'id':video_id,
920                 'url':video_url,
921                 'ext': 'flv',
922                 'title': unescapeHTML(title),
923                 'thumbnail': video_thumb
924                   }
925             videos.append(info)
926         return [self.playlist_result(videos, gameID, game_title)]
927
928 class UstreamIE(InfoExtractor):
929     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
930     IE_NAME = u'ustream'
931
932     def _real_extract(self, url):
933         m = re.match(self._VALID_URL, url)
934         video_id = m.group('videoID')
935
936         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
937         webpage = self._download_webpage(url, video_id)
938
939         self.report_extraction(video_id)
940
941         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
942             webpage, u'title')
943
944         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
945             webpage, u'uploader', fatal=False, flags=re.DOTALL)
946
947         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
948             webpage, u'thumbnail', fatal=False)
949
950         info = {
951                 'id': video_id,
952                 'url': video_url,
953                 'ext': 'flv',
954                 'title': video_title,
955                 'uploader': uploader,
956                 'thumbnail': thumbnail,
957                }
958         return info
959
960 class WorldStarHipHopIE(InfoExtractor):
961     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
962     IE_NAME = u'WorldStarHipHop'
963
964     def _real_extract(self, url):
965         m = re.match(self._VALID_URL, url)
966         video_id = m.group('id')
967
968         webpage_src = self._download_webpage(url, video_id)
969
970         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
971             webpage_src, u'video URL')
972
973         if 'mp4' in video_url:
974             ext = 'mp4'
975         else:
976             ext = 'flv'
977
978         video_title = self._html_search_regex(r"<title>(.*)</title>",
979             webpage_src, u'title')
980
981         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
982         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
983             webpage_src, u'thumbnail', fatal=False)
984
985         if not thumbnail:
986             _title = r"""candytitles.*>(.*)</span>"""
987             mobj = re.search(_title, webpage_src)
988             if mobj is not None:
989                 video_title = mobj.group(1)
990
991         results = [{
992                     'id': video_id,
993                     'url' : video_url,
994                     'title' : video_title,
995                     'thumbnail' : thumbnail,
996                     'ext' : ext,
997                     }]
998         return results
999
1000 class RBMARadioIE(InfoExtractor):
1001     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
1002
1003     def _real_extract(self, url):
1004         m = re.match(self._VALID_URL, url)
1005         video_id = m.group('videoID')
1006
1007         webpage = self._download_webpage(url, video_id)
1008
1009         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1010             webpage, u'json data', flags=re.MULTILINE)
1011
1012         try:
1013             data = json.loads(json_data)
1014         except ValueError as e:
1015             raise ExtractorError(u'Invalid JSON: ' + str(e))
1016
1017         video_url = data['akamai_url'] + '&cbr=256'
1018         url_parts = compat_urllib_parse_urlparse(video_url)
1019         video_ext = url_parts.path.rpartition('.')[2]
1020         info = {
1021                 'id': video_id,
1022                 'url': video_url,
1023                 'ext': video_ext,
1024                 'title': data['title'],
1025                 'description': data.get('teaser_text'),
1026                 'location': data.get('country_of_origin'),
1027                 'uploader': data.get('host', {}).get('name'),
1028                 'uploader_id': data.get('host', {}).get('slug'),
1029                 'thumbnail': data.get('image', {}).get('large_url_2x'),
1030                 'duration': data.get('duration'),
1031         }
1032         return [info]
1033
1034
1035 class YouPornIE(InfoExtractor):
1036     """Information extractor for youporn.com."""
1037     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
1038
1039     def _print_formats(self, formats):
1040         """Print all available formats"""
1041         print(u'Available formats:')
1042         print(u'ext\t\tformat')
1043         print(u'---------------------------------')
1044         for format in formats:
1045             print(u'%s\t\t%s'  % (format['ext'], format['format']))
1046
1047     def _specific(self, req_format, formats):
1048         for x in formats:
1049             if(x["format"]==req_format):
1050                 return x
1051         return None
1052
1053     def _real_extract(self, url):
1054         mobj = re.match(self._VALID_URL, url)
1055         if mobj is None:
1056             raise ExtractorError(u'Invalid URL: %s' % url)
1057         video_id = mobj.group('videoid')
1058
1059         req = compat_urllib_request.Request(url)
1060         req.add_header('Cookie', 'age_verified=1')
1061         webpage = self._download_webpage(req, video_id)
1062
1063         # Get JSON parameters
1064         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1065         try:
1066             params = json.loads(json_params)
1067         except ValueError:
1068             raise ExtractorError(u'Invalid JSON')
1069
1070         self.report_extraction(video_id)
1071         try:
1072             video_title = params['title']
1073             upload_date = unified_strdate(params['release_date_f'])
1074             video_description = params['description']
1075             video_uploader = params['submitted_by']
1076             thumbnail = params['thumbnails'][0]['image']
1077         except KeyError:
1078             raise ExtractorError(u'Missing JSON parameter: ' + compat_str(sys.exc_info()[1]))
1079
1080         # Get all of the formats available
1081         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1082         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1083             webpage, u'download list').strip()
1084
1085         # Get all of the links from the page
1086         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1087         links = re.findall(LINK_RE, download_list_html)
1088         if(len(links) == 0):
1089             raise ExtractorError(u'ERROR: no known formats available for video')
1090
1091         self.to_screen(u'Links found: %d' % len(links))
1092
1093         formats = []
1094         for link in links:
1095
1096             # A link looks like this:
1097             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1098             # A path looks like this:
1099             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
1100             video_url = unescapeHTML( link )
1101             path = compat_urllib_parse_urlparse( video_url ).path
1102             extension = os.path.splitext( path )[1][1:]
1103             format = path.split('/')[4].split('_')[:2]
1104             size = format[0]
1105             bitrate = format[1]
1106             format = "-".join( format )
1107             # title = u'%s-%s-%s' % (video_title, size, bitrate)
1108
1109             formats.append({
1110                 'id': video_id,
1111                 'url': video_url,
1112                 'uploader': video_uploader,
1113                 'upload_date': upload_date,
1114                 'title': video_title,
1115                 'ext': extension,
1116                 'format': format,
1117                 'thumbnail': thumbnail,
1118                 'description': video_description
1119             })
1120
1121         if self._downloader.params.get('listformats', None):
1122             self._print_formats(formats)
1123             return
1124
1125         req_format = self._downloader.params.get('format', None)
1126         self.to_screen(u'Format: %s' % req_format)
1127
1128         if req_format is None or req_format == 'best':
1129             return [formats[0]]
1130         elif req_format == 'worst':
1131             return [formats[-1]]
1132         elif req_format in ('-1', 'all'):
1133             return formats
1134         else:
1135             format = self._specific( req_format, formats )
1136             if format is None:
1137                 raise ExtractorError(u'Requested format not available')
1138             return [format]
1139
1140
1141
1142 class PornotubeIE(InfoExtractor):
1143     """Information extractor for pornotube.com."""
1144     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1145
1146     def _real_extract(self, url):
1147         mobj = re.match(self._VALID_URL, url)
1148         if mobj is None:
1149             raise ExtractorError(u'Invalid URL: %s' % url)
1150
1151         video_id = mobj.group('videoid')
1152         video_title = mobj.group('title')
1153
1154         # Get webpage content
1155         webpage = self._download_webpage(url, video_id)
1156
1157         # Get the video URL
1158         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1159         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1160         video_url = compat_urllib_parse.unquote(video_url)
1161
1162         #Get the uploaded date
1163         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1164         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1165         if upload_date: upload_date = unified_strdate(upload_date)
1166
1167         info = {'id': video_id,
1168                 'url': video_url,
1169                 'uploader': None,
1170                 'upload_date': upload_date,
1171                 'title': video_title,
1172                 'ext': 'flv',
1173                 'format': 'flv'}
1174
1175         return [info]
1176
1177 class YouJizzIE(InfoExtractor):
1178     """Information extractor for youjizz.com."""
1179     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1180
1181     def _real_extract(self, url):
1182         mobj = re.match(self._VALID_URL, url)
1183         if mobj is None:
1184             raise ExtractorError(u'Invalid URL: %s' % url)
1185
1186         video_id = mobj.group('videoid')
1187
1188         # Get webpage content
1189         webpage = self._download_webpage(url, video_id)
1190
1191         # Get the video title
1192         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1193             webpage, u'title').strip()
1194
1195         # Get the embed page
1196         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1197         if result is None:
1198             raise ExtractorError(u'ERROR: unable to extract embed page')
1199
1200         embed_page_url = result.group(0).strip()
1201         video_id = result.group('videoid')
1202
1203         webpage = self._download_webpage(embed_page_url, video_id)
1204
1205         # Get the video URL
1206         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1207             webpage, u'video URL')
1208
1209         info = {'id': video_id,
1210                 'url': video_url,
1211                 'title': video_title,
1212                 'ext': 'flv',
1213                 'format': 'flv',
1214                 'player_url': embed_page_url}
1215
1216         return [info]
1217
1218 class EightTracksIE(InfoExtractor):
1219     IE_NAME = '8tracks'
1220     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1221
1222     def _real_extract(self, url):
1223         mobj = re.match(self._VALID_URL, url)
1224         if mobj is None:
1225             raise ExtractorError(u'Invalid URL: %s' % url)
1226         playlist_id = mobj.group('id')
1227
1228         webpage = self._download_webpage(url, playlist_id)
1229
1230         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1231         data = json.loads(json_like)
1232
1233         session = str(random.randint(0, 1000000000))
1234         mix_id = data['id']
1235         track_count = data['tracks_count']
1236         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1237         next_url = first_url
1238         res = []
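        # The API hands out one track per request; keep following the 'next' endpoint
        # until the response reports at_last_track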
1239         for i in itertools.count():
1240             api_json = self._download_webpage(next_url, playlist_id,
1241                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1242                 errnote=u'Failed to download song information')
1243             api_data = json.loads(api_json)
1244             track_data = api_data[u'set']['track']
1245             info = {
1246                 'id': track_data['id'],
1247                 'url': track_data['track_file_stream_url'],
1248                 'title': track_data['performer'] + u' - ' + track_data['name'],
1249                 'raw_title': track_data['name'],
1250                 'uploader_id': data['user']['login'],
1251                 'ext': 'm4a',
1252             }
1253             res.append(info)
1254             if api_data['set']['at_last_track']:
1255                 break
1256             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
1257         return res
1258
1259 class KeekIE(InfoExtractor):
1260     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
1261     IE_NAME = u'keek'
1262
1263     def _real_extract(self, url):
1264         m = re.match(self._VALID_URL, url)
1265         video_id = m.group('videoID')
1266
1267         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1268         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1269         webpage = self._download_webpage(url, video_id)
1270
1271         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1272             webpage, u'title')
1273
1274         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1275             webpage, u'uploader', fatal=False)
1276
1277         info = {
1278                 'id': video_id,
1279                 'url': video_url,
1280                 'ext': 'mp4',
1281                 'title': video_title,
1282                 'thumbnail': thumbnail,
1283                 'uploader': uploader
1284         }
1285         return [info]
1286
1287 class TEDIE(InfoExtractor):
1288     _VALID_URL=r'''http://www\.ted\.com/
1289                    (
1290                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1291                         |
1292                         ((?P<type_talk>talks)) # We have a simple talk
1293                    )
1294                    (/lang/(.*?))? # The url may contain the language
1295                    /(?P<name>\w+) # Here goes the name and then ".html"
1296                    '''
1297
1298     @classmethod
1299     def suitable(cls, url):
1300         """Receives a URL and returns True if suitable for this IE."""
1301         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1302
1303     def _real_extract(self, url):
1304         m=re.match(self._VALID_URL, url, re.VERBOSE)
1305         if m.group('type_talk'):
1306             return [self._talk_info(url)]
1307         else :
1308             playlist_id=m.group('playlist_id')
1309             name=m.group('name')
1310             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
1311             return [self._playlist_videos_info(url,name,playlist_id)]
1312
1313     def _playlist_videos_info(self,url,name,playlist_id=0):
1314         '''Returns the videos of the playlist'''
1315         video_RE=r'''
1316                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1317                      ([.\s]*?)data-playlist_item_id="(\d+)"
1318                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1319                      '''
1320         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1321         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
1322         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
1323         m_names=re.finditer(video_name_RE,webpage)
1324
1325         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1326                                                  webpage, 'playlist title')
1327
1328         playlist_entries = []
1329         for m_video, m_name in zip(m_videos,m_names):
1330             video_id=m_video.group('video_id')
1331             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1332             playlist_entries.append(self.url_result(talk_url, 'TED'))
1333         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1334
1335     def _talk_info(self, url, video_id=0):
1336         """Return the video for the talk in the url"""
1337         m = re.match(self._VALID_URL, url,re.VERBOSE)
1338         video_name = m.group('name')
1339         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1340         self.report_extraction(video_name)
1341         # If the url includes the language we get the title translated
1342         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1343                                         webpage, 'title')
1344         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1345                                     webpage, 'json data')
1346         info = json.loads(json_data)
1347         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1348                                        webpage, 'description', flags = re.DOTALL)
1349
1350         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1351                                        webpage, 'thumbnail')
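             # 'htmlStreams' lists the available renditions; the last entry is
             # assumed to be the highest-quality one, so it is picked below.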
1352         info = {
1353                 'id': info['id'],
1354                 'url': info['htmlStreams'][-1]['file'],
1355                 'ext': 'mp4',
1356                 'title': title,
1357                 'thumbnail': thumbnail,
1358                 'description': desc,
1359                 }
1360         return info
1361
1362 class MySpassIE(InfoExtractor):
1363     _VALID_URL = r'http://www\.myspass\.de/.*'
1364
1365     def _real_extract(self, url):
1366         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1367
1368         # video id is the last path element of the URL
1369         # usually there is a trailing slash, so also try the second but last
1370         url_path = compat_urllib_parse_urlparse(url).path
1371         url_parent_path, video_id = os.path.split(url_path)
1372         if not video_id:
1373             _, video_id = os.path.split(url_parent_path)
1374
1375         # get metadata
1376         metadata_url = META_DATA_URL_TEMPLATE % video_id
1377         metadata_text = self._download_webpage(metadata_url, video_id)
1378         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1379
1380         # extract values from metadata
1381         url_flv_el = metadata.find('url_flv')
1382         if url_flv_el is None:
1383             raise ExtractorError(u'Unable to extract download url')
1384         video_url = url_flv_el.text
1385         extension = os.path.splitext(video_url)[1][1:]
1386         title_el = metadata.find('title')
1387         if title_el is None:
1388             raise ExtractorError(u'Unable to extract title')
1389         title = title_el.text
1390         format_id_el = metadata.find('format_id')
1391         if format_id_el is None:
1392             format = extension
1393         else:
1394             format = format_id_el.text
1395         description_el = metadata.find('description')
1396         if description_el is not None:
1397             description = description_el.text
1398         else:
1399             description = None
1400         imagePreview_el = metadata.find('imagePreview')
1401         if imagePreview_el is not None:
1402             thumbnail = imagePreview_el.text
1403         else:
1404             thumbnail = None
1405         info = {
1406             'id': video_id,
1407             'url': video_url,
1408             'title': title,
1409             'ext': extension,
1410             'format': format,
1411             'thumbnail': thumbnail,
1412             'description': description
1413         }
1414         return [info]
1415
1416 class SpiegelIE(InfoExtractor):
1417     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1418
1419     def _real_extract(self, url):
1420         m = re.match(self._VALID_URL, url)
1421         video_id = m.group('videoID')
1422
1423         webpage = self._download_webpage(url, video_id)
1424
1425         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1426             webpage, u'title')
1427
1428         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1429         xml_code = self._download_webpage(xml_url, video_id,
1430                     note=u'Downloading XML', errnote=u'Failed to download XML')
1431
1432         idoc = xml.etree.ElementTree.fromstring(xml_code)
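             # The XML describes several <type> variants; the last one is assumed
             # to be the highest-quality encoding, so its filename and duration are used.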
1433         last_type = idoc[-1]
1434         filename = last_type.findall('./filename')[0].text
1435         duration = float(last_type.findall('./duration')[0].text)
1436
1437         video_url = 'http://video2.spiegel.de/flash/' + filename
1438         video_ext = filename.rpartition('.')[2]
1439         info = {
1440             'id': video_id,
1441             'url': video_url,
1442             'ext': video_ext,
1443             'title': video_title,
1444             'duration': duration,
1445         }
1446         return [info]
1447
1448 class LiveLeakIE(InfoExtractor):
1449
1450     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1451     IE_NAME = u'liveleak'
1452
1453     def _real_extract(self, url):
1454         mobj = re.match(self._VALID_URL, url)
1455         if mobj is None:
1456             raise ExtractorError(u'Invalid URL: %s' % url)
1457
1458         video_id = mobj.group('video_id')
1459
1460         webpage = self._download_webpage(url, video_id)
1461
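             # The media URL sits in the page's embedded player configuration,
             # apparently as a JavaScript 'file: "..."' assignment.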
1462         video_url = self._search_regex(r'file: "(.*?)",',
1463             webpage, u'video URL')
1464
1465         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1466             webpage, u'title').replace('LiveLeak.com -', '').strip()
1467
1468         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1469             webpage, u'description', fatal=False)
1470
1471         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1472             webpage, u'uploader', fatal=False)
1473
1474         info = {
1475             'id':  video_id,
1476             'url': video_url,
1477             'ext': 'mp4',
1478             'title': video_title,
1479             'description': video_description,
1480             'uploader': video_uploader
1481         }
1482
1483         return [info]
1484
1485
1486
1487 class TumblrIE(InfoExtractor):
1488     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1489
1490     def _real_extract(self, url):
1491         m_url = re.match(self._VALID_URL, url)
1492         video_id = m_url.group('id')
1493         blog = m_url.group('blog_name')
1494
1495         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1496         webpage = self._download_webpage(url, video_id)
1497
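             # The player markup is embedded in escaped JavaScript, so double quotes
             # appear as \x22 sequences; the regex below matches that escaped form.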
1498         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1499         video = re.search(re_video, webpage)
1500         if video is None:
1501             raise ExtractorError(u'Unable to extract video')
1502         video_url = video.group('video_url')
1503         ext = video.group('ext')
1504
1505         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1506             webpage, u'thumbnail', fatal=False)  # We pick the first poster
1507         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1508
1509         # The only place where you can get a title; it's not complete,
1510         # but searching in other places doesn't work for all videos
1511         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1512             webpage, u'title', flags=re.DOTALL)
1513
1514         return [{'id': video_id,
1515                  'url': video_url,
1516                  'title': video_title,
1517                  'thumbnail': video_thumbnail,
1518                  'ext': ext
1519                  }]
1520
1521 class BandcampIE(InfoExtractor):
1522     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1523
1524     def _real_extract(self, url):
1525         mobj = re.match(self._VALID_URL, url)
1526         title = mobj.group('title')
1527         webpage = self._download_webpage(url, title)
1528         # We get the link to the free download page
1529         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1530         if m_download is None:
1531             raise ExtractorError(u'No free songs found')
1532
1533         download_link = m_download.group(1)
1534         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
1535                        webpage, re.MULTILINE|re.DOTALL).group('id')
1536
1537         download_webpage = self._download_webpage(download_link, id,
1538                                                   'Downloading free downloads page')
1539         # We get the dictionary of the track from some JavaScript code
1540         info = re.search(r'items: (.*?),$',
1541                          download_webpage, re.MULTILINE).group(1)
1542         info = json.loads(info)[0]
1543         # We pick mp3-320 for now, until format selection can be easily implemented.
1544         mp3_info = info[u'downloads'][u'mp3-320']
1545         # If we try to use this url it says the link has expired
1546         initial_url = mp3_info[u'url']
1547         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1548         m_url = re.match(re_url, initial_url)
1549         # We build the url we will use to get the final track url
1550         # This url is built by Bandcamp in the script download_bunde_*.js
1551         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1552         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1553         # If we could correctly generate the .rand field, the url would be
1554         # in the "download_url" key
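             # Since .rand is hardcoded above, fall back to the "retry_url" field,
             # which appears to point at the same track.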
1555         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1556
1557         track_info = {'id':id,
1558                       'title' : info[u'title'],
1559                       'ext' :   'mp3',
1560                       'url' :   final_url,
1561                       'thumbnail' : info[u'thumb_url'],
1562                       'uploader' :  info[u'artist']
1563                       }
1564
1565         return [track_info]
1566
1567 class RedTubeIE(InfoExtractor):
1568     """Information Extractor for redtube"""
1569     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1570
1571     def _real_extract(self,url):
1572         mobj = re.match(self._VALID_URL, url)
1573         if mobj is None:
1574             raise ExtractorError(u'Invalid URL: %s' % url)
1575
1576         video_id = mobj.group('id')
1577         video_extension = 'mp4'
1578         webpage = self._download_webpage(url, video_id)
1579
1580         self.report_extraction(video_id)
1581
1582         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1583             webpage, u'video URL')
1584
1585         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1586             webpage, u'title')
1587
1588         return [{
1589             'id':       video_id,
1590             'url':      video_url,
1591             'ext':      video_extension,
1592             'title':    video_title,
1593         }]
1594
1595 class InaIE(InfoExtractor):
1596     """Information Extractor for Ina.fr"""
1597     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1598
1599     def _real_extract(self,url):
1600         mobj = re.match(self._VALID_URL, url)
1601
1602         video_id = mobj.group('id')
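             # Metadata (including the direct MP4 URL) is published in an MRSS feed
             # keyed by the video id.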
1603         mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
1604         video_extension = 'mp4'
1605         webpage = self._download_webpage(mrss_url, video_id)
1606
1607         self.report_extraction(video_id)
1608
1609         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1610             webpage, u'video URL')
1611
1612         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1613             webpage, u'title')
1614
1615         return [{
1616             'id':       video_id,
1617             'url':      video_url,
1618             'ext':      video_extension,
1619             'title':    video_title,
1620         }]
1621
1622 class HowcastIE(InfoExtractor):
1623     """Information Extractor for Howcast.com"""
1624     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1625
1626     def _real_extract(self, url):
1627         mobj = re.match(self._VALID_URL, url)
1628
1629         video_id = mobj.group('id')
1630         webpage_url = 'http://www.howcast.com/videos/' + video_id
1631         webpage = self._download_webpage(webpage_url, video_id)
1632
1633         self.report_extraction(video_id)
1634
1635         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1636             webpage, u'video URL')
1637
1638         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1639             webpage, u'title')
1640
1641         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1642             webpage, u'description', fatal=False)
1643
1644         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1645             webpage, u'thumbnail', fatal=False)
1646
1647         return [{
1648             'id':       video_id,
1649             'url':      video_url,
1650             'ext':      'mp4',
1651             'title':    video_title,
1652             'description': video_description,
1653             'thumbnail': thumbnail,
1654         }]
1655
1656 class VineIE(InfoExtractor):
1657     """Information Extractor for Vine.co"""
1658     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1659
1660     def _real_extract(self, url):
1661         mobj = re.match(self._VALID_URL, url)
1662
1663         video_id = mobj.group('id')
1664         webpage_url = 'https://vine.co/v/' + video_id
1665         webpage = self._download_webpage(webpage_url, video_id)
1666
1667         self.report_extraction(video_id)
1668
1669         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1670             webpage, u'video URL')
1671
1672         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1673             webpage, u'title')
1674
1675         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1676             webpage, u'thumbnail', fatal=False)
1677
1678         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1679             webpage, u'uploader', fatal=False, flags=re.DOTALL)
1680
1681         return [{
1682             'id':        video_id,
1683             'url':       video_url,
1684             'ext':       'mp4',
1685             'title':     video_title,
1686             'thumbnail': thumbnail,
1687             'uploader':  uploader,
1688         }]
1689
1690 class FlickrIE(InfoExtractor):
1691     """Information Extractor for Flickr videos"""
1692     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1693
1694     def _real_extract(self, url):
1695         mobj = re.match(self._VALID_URL, url)
1696
1697         video_id = mobj.group('id')
1698         video_uploader_id = mobj.group('uploader_id')
1699         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1700         webpage = self._download_webpage(webpage_url, video_id)
1701
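             # Flickr appears to require a per-photo "secret" token before it will
             # serve the video playlist XML, so extract it from the page first.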
1702         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1703
1704         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1705         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1706
1707         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1708             first_xml, u'node_id')
1709
1710         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1711         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1712
1713         self.report_extraction(video_id)
1714
1715         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1716         if mobj is None:
1717             raise ExtractorError(u'Unable to extract video url')
1718         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1719
1720         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1721             webpage, u'video title')
1722
1723         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1724             webpage, u'description', fatal=False)
1725
1726         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1727             webpage, u'thumbnail', fatal=False)
1728
1729         return [{
1730             'id':          video_id,
1731             'url':         video_url,
1732             'ext':         'mp4',
1733             'title':       video_title,
1734             'description': video_description,
1735             'thumbnail':   thumbnail,
1736             'uploader_id': video_uploader_id,
1737         }]
1738
1739 class TeamcocoIE(InfoExtractor):
1740     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1741
1742     def _real_extract(self, url):
1743         mobj = re.match(self._VALID_URL, url)
1744         if mobj is None:
1745             raise ExtractorError(u'Invalid URL: %s' % url)
1746         url_title = mobj.group('url_title')
1747         webpage = self._download_webpage(url, url_title)
1748
1749         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1750             webpage, u'video id')
1751
1752         self.report_extraction(video_id)
1753
1754         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1755             webpage, u'title')
1756
1757         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1758             webpage, u'thumbnail', fatal=False)
1759
1760         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1761             webpage, u'description', fatal=False)
1762
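             # Media URLs are listed in a separate CVP XML document keyed by the
             # video id; the "high" quality variant is picked from it below.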
1763         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1764         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1765
1766         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1767             data, u'video URL')
1768
1769         return [{
1770             'id':          video_id,
1771             'url':         video_url,
1772             'ext':         'mp4',
1773             'title':       video_title,
1774             'thumbnail':   thumbnail,
1775             'description': video_description,
1776         }]
1777
1778 class XHamsterIE(InfoExtractor):
1779     """Information Extractor for xHamster"""
1780     _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1781
1782     def _real_extract(self,url):
1783         mobj = re.match(self._VALID_URL, url)
1784
1785         video_id = mobj.group('id')
1786         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1787         webpage = self._download_webpage(mrss_url, video_id)
1788
1789         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1790         if mobj is None:
1791             raise ExtractorError(u'Unable to extract media URL')
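             # If 'srv' is empty, 'file' already seems to hold the full (URL-encoded)
             # media URL; otherwise the two parts have to be joined.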
1792         if len(mobj.group('server')) == 0:
1793             video_url = compat_urllib_parse.unquote(mobj.group('file'))
1794         else:
1795             video_url = mobj.group('server')+'/key='+mobj.group('file')
1796         video_extension = video_url.split('.')[-1]
1797
1798         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1799             webpage, u'title')
1800
1801         # Can't see the description anywhere in the UI
1802         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1803         #     webpage, u'description', fatal=False)
1804         # if video_description: video_description = unescapeHTML(video_description)
1805
1806         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1807         if mobj:
1808             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1809         else:
1810             video_upload_date = None
1811             self._downloader.report_warning(u'Unable to extract upload date')
1812
1813         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1814             webpage, u'uploader id', default=u'anonymous')
1815
1816         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1817             webpage, u'thumbnail', fatal=False)
1818
1819         return [{
1820             'id':       video_id,
1821             'url':      video_url,
1822             'ext':      video_extension,
1823             'title':    video_title,
1824             # 'description': video_description,
1825             'upload_date': video_upload_date,
1826             'uploader_id': video_uploader_id,
1827             'thumbnail': video_thumbnail
1828         }]
1829
1830 class HypemIE(InfoExtractor):
1831     """Information Extractor for hypem"""
1832     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1833
1834     def _real_extract(self, url):
1835         mobj = re.match(self._VALID_URL, url)
1836         if mobj is None:
1837             raise ExtractorError(u'Invalid URL: %s' % url)
1838         track_id = mobj.group(1)
1839
1840         data = { 'ax': 1, 'ts': time.time() }
1841         data_encoded = compat_urllib_parse.urlencode(data)
1842         complete_url = url + "?" + data_encoded
1843         request = compat_urllib_request.Request(complete_url)
1844         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
1845         cookie = urlh.headers.get('Set-Cookie', '')
1846
1847         self.report_extraction(track_id)
1848
1849         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1850             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1851         try:
1852             track_list = json.loads(html_tracks)
1853             track = track_list[u'tracks'][0]
1854         except ValueError:
1855             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1856
1857         key = track[u"key"]
1858         track_id = track[u"id"]
1859         artist = track[u"artist"]
1860         title = track[u"song"]
1861
1862         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1863         request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
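             # The session cookie captured from the listing page is forwarded here;
             # the serve endpoint appears to require it.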
1864         request.add_header('cookie', cookie)
1865         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1866         try:
1867             song_data = json.loads(song_data_json)
1868         except ValueError:
1869             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1870         final_url = song_data[u"url"]
1871
1872         return [{
1873             'id':       track_id,
1874             'url':      final_url,
1875             'ext':      "mp3",
1876             'title':    title,
1877             'artist':   artist,
1878         }]
1879
1880 class Vbox7IE(InfoExtractor):
1881     """Information Extractor for Vbox7"""
1882     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1883
1884     def _real_extract(self,url):
1885         mobj = re.match(self._VALID_URL, url)
1886         if mobj is None:
1887             raise ExtractorError(u'Invalid URL: %s' % url)
1888         video_id = mobj.group(1)
1889
1890         redirect_page, urlh = self._download_webpage_handle(url, video_id)
1891         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1892         redirect_url = urlh.geturl() + new_location
1893         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1894
1895         title = self._html_search_regex(r'<title>(.*)</title>',
1896             webpage, u'title').split('/')[0].strip()
1897
1898         ext = "flv"
1899         info_url = "http://vbox7.com/play/magare.do"
1900         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1901         info_request = compat_urllib_request.Request(info_url, data)
1902         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1903         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1904         if info_response is None:
1905             raise ExtractorError(u'Unable to extract the media url')
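             # The response is assumed to be two '&'-separated key=value pairs
             # holding the final media URL and the thumbnail URL.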
1906         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1907
1908         return [{
1909             'id':        video_id,
1910             'url':       final_url,
1911             'ext':       ext,
1912             'title':     title,
1913             'thumbnail': thumbnail_url,
1914         }]
1915
1916
1917 def gen_extractors():
1918     """ Return a list with an instance of every supported extractor.
1919     The order does matter; the first extractor matched is the one handling the URL.
1920     """
1921     return [
1922         YoutubePlaylistIE(),
1923         YoutubeChannelIE(),
1924         YoutubeUserIE(),
1925         YoutubeSearchIE(),
1926         YoutubeIE(),
1927         MetacafeIE(),
1928         DailymotionIE(),
1929         GoogleSearchIE(),
1930         PhotobucketIE(),
1931         YahooIE(),
1932         YahooSearchIE(),
1933         DepositFilesIE(),
1934         FacebookIE(),
1935         BlipTVIE(),
1936         BlipTVUserIE(),
1937         VimeoIE(),
1938         MyVideoIE(),
1939         ComedyCentralIE(),
1940         EscapistIE(),
1941         CollegeHumorIE(),
1942         XVideosIE(),
1943         SoundcloudSetIE(),
1944         SoundcloudIE(),
1945         InfoQIE(),
1946         MixcloudIE(),
1947         StanfordOpenClassroomIE(),
1948         MTVIE(),
1949         YoukuIE(),
1950         XNXXIE(),
1951         YouJizzIE(),
1952         PornotubeIE(),
1953         YouPornIE(),
1954         GooglePlusIE(),
1955         ArteTvIE(),
1956         NBAIE(),
1957         WorldStarHipHopIE(),
1958         JustinTVIE(),
1959         FunnyOrDieIE(),
1960         SteamIE(),
1961         UstreamIE(),
1962         RBMARadioIE(),
1963         EightTracksIE(),
1964         KeekIE(),
1965         TEDIE(),
1966         MySpassIE(),
1967         SpiegelIE(),
1968         LiveLeakIE(),
1969         ARDIE(),
1970         ZDFIE(),
1971         TumblrIE(),
1972         BandcampIE(),
1973         RedTubeIE(),
1974         InaIE(),
1975         HowcastIE(),
1976         VineIE(),
1977         FlickrIE(),
1978         TeamcocoIE(),
1979         XHamsterIE(),
1980         HypemIE(),
1981         Vbox7IE(),
1982         GametrailersIE(),
1983         StatigramIE(),
1984         GenericIE()
1985     ]
1986
1987 def get_info_extractor(ie_name):
1988     """Returns the info extractor class with the given ie_name"""
1989     return globals()[ie_name+'IE']