youtube_dl/InfoExtractors.py @ commit 6c75d84e5512fd15640f4664c68a865c954cce4b (youtube-dl)
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.depositfiles import DepositFilesIE
27 from .extractor.facebook import FacebookIE
28 from .extractor.gametrailers import GametrailersIE
29 from .extractor.generic import GenericIE
30 from .extractor.googleplus import GooglePlusIE
31 from .extractor.googlesearch import GoogleSearchIE
32 from .extractor.metacafe import MetacafeIE
33 from .extractor.myvideo import MyVideoIE
34 from .extractor.statigram import StatigramIE
35 from .extractor.photobucket import PhotobucketIE
36 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
37 from .extractor.vimeo import VimeoIE
38 from .extractor.yahoo import YahooIE, YahooSearchIE
39 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
40 from .extractor.zdf import ZDFIE
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69 class EscapistIE(InfoExtractor):
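    """Information extractor for escapistmagazine.com videos."""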
70     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
71
72     def _real_extract(self, url):
73         mobj = re.match(self._VALID_URL, url)
74         if mobj is None:
75             raise ExtractorError(u'Invalid URL: %s' % url)
76         showName = mobj.group('showname')
77         videoId = mobj.group('episode')
78
79         self.report_extraction(videoId)
80         webpage = self._download_webpage(url, videoId)
81
82         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
83             webpage, u'description', fatal=False)
84
85         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
86             webpage, u'thumbnail', fatal=False)
87
88         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
89             webpage, u'player url')
90
91         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
92             webpage, u'title').split(' : ')[-1]
93
94         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
95         configUrl = compat_urllib_parse.unquote(configUrl)
96
97         configJSON = self._download_webpage(configUrl, videoId,
98                                             u'Downloading configuration',
99                                             u'unable to download configuration')
100
101         # Technically, it's JavaScript, not JSON
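        # The naive quote swap below assumes the config's string values contain
        # no apostrophes; if they do, the json.loads call is likely to fail.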
102         configJSON = configJSON.replace("'", '"')
103
104         try:
105             config = json.loads(configJSON)
106         except ValueError as err:
107             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
108
109         playlist = config['playlist']
110         videoUrl = playlist[1]['url']
111
112         info = {
113             'id': videoId,
114             'url': videoUrl,
115             'uploader': showName,
116             'upload_date': None,
117             'title': title,
118             'ext': 'mp4',
119             'thumbnail': imgUrl,
120             'description': videoDesc,
121             'player_url': playerUrl,
122         }
123
124         return [info]
125
126 class CollegeHumorIE(InfoExtractor):
127     """Information extractor for collegehumor.com"""
128
129     _WORKING = False
130     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
131     IE_NAME = u'collegehumor'
132
133     def report_manifest(self, video_id):
134         """Report information extraction."""
135         self.to_screen(u'%s: Downloading XML manifest' % video_id)
136
137     def _real_extract(self, url):
138         mobj = re.match(self._VALID_URL, url)
139         if mobj is None:
140             raise ExtractorError(u'Invalid URL: %s' % url)
141         video_id = mobj.group('videoid')
142
143         info = {
144             'id': video_id,
145             'uploader': None,
146             'upload_date': None,
147         }
148
149         self.report_extraction(video_id)
150         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
151         try:
152             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
153         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
154             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
155
156         mdoc = xml.etree.ElementTree.fromstring(metaXml)
157         try:
158             videoNode = mdoc.findall('./video')[0]
159             info['description'] = videoNode.findall('./description')[0].text
160             info['title'] = videoNode.findall('./caption')[0].text
161             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
162             manifest_url = videoNode.findall('./file')[0].text
163         except IndexError:
164             raise ExtractorError(u'Invalid metadata XML file')
165
166         manifest_url += '?hdcore=2.10.3'
167         self.report_manifest(video_id)
168         try:
169             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
170         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
171             raise ExtractorError(u'Unable to download video manifest XML: %s' % compat_str(err))
172
173         adoc = xml.etree.ElementTree.fromstring(manifestXml)
174         try:
175             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
176             node_id = media_node.attrib['url']
177             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
178         except (IndexError, KeyError):
179             raise ExtractorError(u'Invalid manifest file')
180
181         url_pr = compat_urllib_parse_urlparse(manifest_url)
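        # The manifest is an Adobe HDS (f4m) file; rebuild the fragment URL by
        # hand so it points at the first segment/fragment of the media node.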
182         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
183
184         info['url'] = url
185         info['ext'] = 'f4f'
186         return [info]
187
188
189 class XVideosIE(InfoExtractor):
190     """Information extractor for xvideos.com"""
191
192     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
193     IE_NAME = u'xvideos'
194
195     def _real_extract(self, url):
196         mobj = re.match(self._VALID_URL, url)
197         if mobj is None:
198             raise ExtractorError(u'Invalid URL: %s' % url)
199         video_id = mobj.group(1)
200
201         webpage = self._download_webpage(url, video_id)
202
203         self.report_extraction(video_id)
204
205         # Extract video URL
206         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
207             webpage, u'video URL'))
208
209         # Extract title
210         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
211             webpage, u'title')
212
213         # Extract video thumbnail
214         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
215             webpage, u'thumbnail', fatal=False)
216
217         info = {
218             'id': video_id,
219             'url': video_url,
220             'uploader': None,
221             'upload_date': None,
222             'title': video_title,
223             'ext': 'flv',
224             'thumbnail': video_thumbnail,
225             'description': None,
226         }
227
228         return [info]
229
230
231
232
233 class InfoQIE(InfoExtractor):
234     """Information extractor for infoq.com"""
235     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
236
237     def _real_extract(self, url):
238         mobj = re.match(self._VALID_URL, url)
239         if mobj is None:
240             raise ExtractorError(u'Invalid URL: %s' % url)
241
242         webpage = self._download_webpage(url, video_id=url)
243         self.report_extraction(url)
244
245         # Extract video URL
246         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
247         if mobj is None:
248             raise ExtractorError(u'Unable to extract video url')
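        # jsclassref holds the RTMP path base64-encoded and URL-quoted; decode
        # it and prepend InfoQ's RTMP server prefix.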
249         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
250         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
251
252         # Extract title
253         video_title = self._search_regex(r'contentTitle = "(.*?)";',
254             webpage, u'title')
255
256         # Extract description
257         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
258             webpage, u'description', fatal=False)
259
260         video_filename = video_url.split('/')[-1]
261         video_id, extension = video_filename.split('.')
262
263         info = {
264             'id': video_id,
265             'url': video_url,
266             'uploader': None,
267             'upload_date': None,
268             'title': video_title,
269             'ext': extension, # extension taken from the URL (usually mp4), though the stream itself appears to be flv
270             'thumbnail': None,
271             'description': video_description,
272         }
273
274         return [info]
275
276 class MixcloudIE(InfoExtractor):
277     """Information extractor for www.mixcloud.com"""
278
279     _WORKING = False # Mixcloud has moved to a new API that looks usable: http://www.mixcloud.com/developers/documentation/
280     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
281     IE_NAME = u'mixcloud'
282
283     def report_download_json(self, file_id):
284         """Report JSON download."""
285         self.to_screen(u'Downloading json')
286
287     def get_urls(self, jsonData, fmt, bitrate='best'):
288         """Get urls from 'audio_formats' section in json"""
289         file_url = None
290         try:
291             bitrate_list = jsonData[fmt]
292             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
293                 bitrate = max(bitrate_list) # select highest
294
295             url_list = jsonData[fmt][bitrate]
296         except TypeError: # we have no bitrate info.
297             url_list = jsonData[fmt]
298         return url_list
299
300     def check_urls(self, url_list):
301         """Returns 1st active url from list"""
302         for url in url_list:
303             try:
304                 compat_urllib_request.urlopen(url)
305                 return url
306             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
307                 url = None
308
309         return None
310
311     def _print_formats(self, formats):
312         print('Available formats:')
313         for fmt in formats.keys():
314             for b in formats[fmt]:
315                 try:
316                     ext = formats[fmt][b][0]
317                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
318                 except TypeError: # we have no bitrate info
319                     ext = formats[fmt][0]
320                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
321                     break
322
323     def _real_extract(self, url):
324         mobj = re.match(self._VALID_URL, url)
325         if mobj is None:
326             raise ExtractorError(u'Invalid URL: %s' % url)
327         # extract uploader & filename from url
328         uploader = mobj.group(1)
329         file_id = uploader + "-" + mobj.group(2)
330
331         # construct API request
332         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
333         # retrieve .json file with links to files
334         request = compat_urllib_request.Request(file_url)
335         try:
336             self.report_download_json(file_url)
337             jsonData = compat_urllib_request.urlopen(request).read()
338         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
339             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
340
341         # parse JSON
342         json_data = json.loads(jsonData)
343         player_url = json_data['player_swf_url']
344         formats = dict(json_data['audio_formats'])
345
346         req_format = self._downloader.params.get('format', None)
347         bitrate = None
348
349         if self._downloader.params.get('listformats', None):
350             self._print_formats(formats)
351             return
352
353         if req_format is None or req_format == 'best':
354             for format_param in formats.keys():
355                 url_list = self.get_urls(formats, format_param)
356                 # check urls
357                 file_url = self.check_urls(url_list)
358                 if file_url is not None:
359                     break # got it!
360         else:
361             if req_format not in formats:
362                 raise ExtractorError(u'Format is not available')
363
364             url_list = self.get_urls(formats, req_format)
365             file_url = self.check_urls(url_list)
366             format_param = req_format
367
368         return [{
369             'id': file_id,
370             'url': file_url,
371             'uploader': uploader,
372             'upload_date': None,
373             'title': json_data['name'],
374             'ext': file_url.split('.')[-1],
375             'format': (u'NA' if format_param is None else format_param),
376             'thumbnail': json_data['thumbnail_url'],
377             'description': json_data['description'],
378             'player_url': player_url,
379         }]
380
381 class StanfordOpenClassroomIE(InfoExtractor):
382     """Information extractor for Stanford's Open ClassRoom"""
383
384     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
385     IE_NAME = u'stanfordoc'
386
387     def _real_extract(self, url):
388         mobj = re.match(self._VALID_URL, url)
389         if mobj is None:
390             raise ExtractorError(u'Invalid URL: %s' % url)
391
392         if mobj.group('course') and mobj.group('video'): # A specific video
393             course = mobj.group('course')
394             video = mobj.group('video')
395             info = {
396                 'id': course + '_' + video,
397                 'uploader': None,
398                 'upload_date': None,
399             }
400
401             self.report_extraction(info['id'])
402             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
403             xmlUrl = baseUrl + video + '.xml'
404             try:
405                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
406             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
407                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
408             mdoc = xml.etree.ElementTree.fromstring(metaXml)
409             try:
410                 info['title'] = mdoc.findall('./title')[0].text
411                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
412             except IndexError:
413                 raise ExtractorError(u'Invalid metadata XML file')
414             info['ext'] = info['url'].rpartition('.')[2]
415             return [info]
416         elif mobj.group('course'): # A course page
417             course = mobj.group('course')
418             info = {
419                 'id': course,
420                 'type': 'playlist',
421                 'uploader': None,
422                 'upload_date': None,
423             }
424
425             coursepage = self._download_webpage(url, info['id'],
426                                         note='Downloading course info page',
427                                         errnote='Unable to download course info page')
428
429             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
430
431             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
432                 coursepage, u'description', fatal=False)
433
434             links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage))
435             info['list'] = [
436                 {
437                     'type': 'reference',
438                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
439                 }
440                     for vpage in links]
441             results = []
442             for entry in info['list']:
443                 assert entry['type'] == 'reference'
444                 results += self.extract(entry['url'])
445             return results
446         else: # Root page
447             info = {
448                 'id': 'Stanford OpenClassroom',
449                 'type': 'playlist',
450                 'uploader': None,
451                 'upload_date': None,
452             }
453
454             self.report_download_webpage(info['id'])
455             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
456             try:
457                 rootpage = compat_urllib_request.urlopen(rootURL).read()
458             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
459                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
460
461             info['title'] = info['id']
462
463             links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage))
464             info['list'] = [
465                 {
466                     'type': 'reference',
467                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
468                 }
469                     for cpage in links]
470
471             results = []
472             for entry in info['list']:
473                 assert entry['type'] == 'reference'
474                 results += self.extract(entry['url'])
475             return results
476
477 class MTVIE(InfoExtractor):
478     """Information extractor for MTV.com"""
479
480     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
481     IE_NAME = u'mtv'
482
483     def _real_extract(self, url):
484         mobj = re.match(self._VALID_URL, url)
485         if mobj is None:
486             raise ExtractorError(u'Invalid URL: %s' % url)
487         if not mobj.group('proto'):
488             url = 'http://' + url
489         video_id = mobj.group('videoid')
490
491         webpage = self._download_webpage(url, video_id)
492
493         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
494             webpage, u'song name', fatal=False)
495
496         performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
497             webpage, u'performer')
498         video_title = performer + u' - ' + song_name if song_name else performer
499         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
500             webpage, u'mtvn_uri', fatal=False)
501
502         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
503             webpage, u'content id', fatal=False)
504
505         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
506         self.report_extraction(video_id)
507         request = compat_urllib_request.Request(videogen_url)
508         try:
509             metadataXml = compat_urllib_request.urlopen(request).read()
510         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
511             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
512
513         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
514         renditions = mdoc.findall('.//rendition')
515
516         # For now, always pick the highest quality.
517         rendition = renditions[-1]
518
519         try:
520             _,_,ext = rendition.attrib['type'].partition('/')
521             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
522             video_url = rendition.find('./src').text
523         except KeyError:
524             raise ExtractorError('Invalid rendition field.')
525
526         info = {
527             'id': video_id,
528             'url': video_url,
529             'uploader': performer,
530             'upload_date': None,
531             'title': video_title,
532             'ext': ext,
533             'format': format,
534         }
535
536         return [info]
537
538
539 class YoukuIE(InfoExtractor):
540     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
541
542     def _gen_sid(self):
543         nowTime = int(time.time() * 1000)
544         random1 = random.randint(1000,1998)
545         random2 = random.randint(1000,9999)
546
547         return "%d%d%d" %(nowTime,random1,random2)
548
549     def _get_file_ID_mix_string(self, seed):
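        """Shuffle the source alphabet using a seed-driven generator.

        This presumably mirrors the scrambling done on Youku's side, so the
        numeric indices in the '*'-separated fileid can be mapped back to
        characters by _get_file_id().
        """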
550         mixed = []
551         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
552         seed = float(seed)
553         for i in range(len(source)):
554             seed  =  (seed * 211 + 30031 ) % 65536
555             index  =  math.floor(seed / 65536 * len(source) )
556             mixed.append(source[int(index)])
557             source.remove(source[int(index)])
558         #return ''.join(mixed)
559         return mixed
560
561     def _get_file_id(self, fileId, seed):
562         mixed = self._get_file_ID_mix_string(seed)
563         ids = fileId.split('*')
564         realId = []
565         for ch in ids:
566             if ch:
567                 realId.append(mixed[int(ch)])
568         return ''.join(realId)
569
570     def _real_extract(self, url):
571         mobj = re.match(self._VALID_URL, url)
572         if mobj is None:
573             raise ExtractorError(u'Invalid URL: %s' % url)
574         video_id = mobj.group('ID')
575
576         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
577
578         jsondata = self._download_webpage(info_url, video_id)
579
580         self.report_extraction(video_id)
581         try:
582             config = json.loads(jsondata)
583
584             video_title =  config['data'][0]['title']
585             seed = config['data'][0]['seed']
586
587             format = self._downloader.params.get('format', None)
588             supported_format = list(config['data'][0]['streamfileids'].keys())
589
590             if format is None or format == 'best':
591                 if 'hd2' in supported_format:
592                     format = 'hd2'
593                 else:
594                     format = 'flv'
595                 ext = u'flv'
596             elif format == 'worst':
597                 format = 'mp4'
598                 ext = u'mp4'
599             else:
600                 format = 'flv'
601                 ext = u'flv'
602
603
604             fileid = config['data'][0]['streamfileids'][format]
605             keys = [s['k'] for s in config['data'][0]['segs'][format]]
606         except (UnicodeDecodeError, ValueError, KeyError):
607             raise ExtractorError(u'Unable to extract info section')
608
609         files_info=[]
610         sid = self._gen_sid()
611         fileid = self._get_file_id(fileid, seed)
612
613         # characters 9 and 10 of fileid (fileid[8:10]) encode the segment number
614         # and are replaced with the current segment index below
615         for index, key in enumerate(keys):
616
617             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
618             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
619
620             info = {
621                 'id': '%s_part%02d' % (video_id, index),
622                 'url': download_url,
623                 'uploader': None,
624                 'upload_date': None,
625                 'title': video_title,
626                 'ext': ext,
627             }
628             files_info.append(info)
629
630         return files_info
631
632
633 class XNXXIE(InfoExtractor):
634     """Information extractor for xnxx.com"""
635
636     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
637     IE_NAME = u'xnxx'
638     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
639     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
640     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
641
642     def _real_extract(self, url):
643         mobj = re.match(self._VALID_URL, url)
644         if mobj is None:
645             raise ExtractorError(u'Invalid URL: %s' % url)
646         video_id = mobj.group(1)
647
648         # Get webpage content
649         webpage = self._download_webpage(url, video_id)
650
651         video_url = self._search_regex(self.VIDEO_URL_RE,
652             webpage, u'video URL')
653         video_url = compat_urllib_parse.unquote(video_url)
654
655         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
656             webpage, u'title')
657
658         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
659             webpage, u'thumbnail', fatal=False)
660
661         return [{
662             'id': video_id,
663             'url': video_url,
664             'uploader': None,
665             'upload_date': None,
666             'title': video_title,
667             'ext': 'flv',
668             'thumbnail': video_thumbnail,
669             'description': None,
670         }]
671
672
673
674 class NBAIE(InfoExtractor):
675     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
676     IE_NAME = u'nba'
677
678     def _real_extract(self, url):
679         mobj = re.match(self._VALID_URL, url)
680         if mobj is None:
681             raise ExtractorError(u'Invalid URL: %s' % url)
682
683         video_id = mobj.group(1)
684
685         webpage = self._download_webpage(url, video_id)
686
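        # The MP4 is served directly from Turner's CDN; its URL can be derived
        # from the path in the page URL, so nothing needs to be read from the page for it.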
687         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
688
689         shortened_video_id = video_id.rpartition('/')[2]
690         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
691             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
692
693         # The upload date is not present in the HTML the page returns to us
694         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
695
696         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
697
698         info = {
699             'id': shortened_video_id,
700             'url': video_url,
701             'ext': 'mp4',
702             'title': title,
703             # 'uploader_date': uploader_date,
704             'description': description,
705         }
706         return [info]
707
708 class JustinTVIE(InfoExtractor):
709     """Information extractor for justin.tv and twitch.tv"""
710     # TODO: One broadcast may be split into multiple videos. The key
711     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
712     # starts at 1 and increases. Can we treat all parts as one video?
713
714     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
715         (?:
716             (?P<channelid>[^/]+)|
717             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
718             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
719         )
720         /?(?:\#.*)?$
721         """
722     _JUSTIN_PAGE_LIMIT = 100
723     IE_NAME = u'justin.tv'
724
725     def report_download_page(self, channel, offset):
726         """Report attempt to download a single page of videos."""
727         self.to_screen(u'%s: Downloading video information from %d to %d' %
728                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
729
730     # Return count of items, list of *valid* items
731     def _parse_page(self, url, video_id):
732         webpage = self._download_webpage(url, video_id,
733                                          u'Downloading video info JSON',
734                                          u'unable to download video info JSON')
735
736         response = json.loads(webpage)
737         if not isinstance(response, list):
738             error_text = response.get('error', 'unknown error')
739             raise ExtractorError(u'Justin.tv API: %s' % error_text)
740         info = []
741         for clip in response:
742             video_url = clip['video_file_url']
743             if video_url:
744                 video_extension = os.path.splitext(video_url)[1][1:]
745                 video_date = re.sub('-', '', clip['start_time'][:10])
746                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
747                 video_id = clip['id']
748                 video_title = clip.get('title', video_id)
749                 info.append({
750                     'id': video_id,
751                     'url': video_url,
752                     'title': video_title,
753                     'uploader': clip.get('channel_name', video_uploader_id),
754                     'uploader_id': video_uploader_id,
755                     'upload_date': video_date,
756                     'ext': video_extension,
757                 })
758         return (len(response), info)
759
760     def _real_extract(self, url):
761         mobj = re.match(self._VALID_URL, url)
762         if mobj is None:
763             raise ExtractorError(u'invalid URL: %s' % url)
764
765         api_base = 'http://api.justin.tv'
766         paged = False
767         if mobj.group('channelid'):
768             paged = True
769             video_id = mobj.group('channelid')
770             api = api_base + '/channel/archives/%s.json' % video_id
771         elif mobj.group('chapterid'):
772             chapter_id = mobj.group('chapterid')
773
774             webpage = self._download_webpage(url, chapter_id)
775             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
776             if not m:
777                 raise ExtractorError(u'Cannot find archive of a chapter')
778             archive_id = m.group(1)
779
780             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
781             chapter_info_xml = self._download_webpage(api, chapter_id,
782                                              note=u'Downloading chapter information',
783                                              errnote=u'Chapter information download failed')
784             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
785             for a in doc.findall('.//archive'):
786                 if archive_id == a.find('./id').text:
787                     break
788             else:
789                 raise ExtractorError(u'Could not find chapter in chapter information')
790
791             video_url = a.find('./video_file_url').text
792             video_ext = video_url.rpartition('.')[2] or u'flv'
793
794             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
795             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
796                                    note='Downloading chapter metadata',
797                                    errnote='Download of chapter metadata failed')
798             chapter_info = json.loads(chapter_info_json)
799
800             bracket_start = int(doc.find('.//bracket_start').text)
801             bracket_end = int(doc.find('.//bracket_end').text)
802
803             # TODO determine start (and probably fix up file)
804             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
805             #video_url += u'?start=' + TODO:start_timestamp
806             # bracket_start is 13290, but we want 51670615
807             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
808                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
809
810             info = {
811                 'id': u'c' + chapter_id,
812                 'url': video_url,
813                 'ext': video_ext,
814                 'title': chapter_info['title'],
815                 'thumbnail': chapter_info['preview'],
816                 'description': chapter_info['description'],
817                 'uploader': chapter_info['channel']['display_name'],
818                 'uploader_id': chapter_info['channel']['name'],
819             }
820             return [info]
821         else:
822             video_id = mobj.group('videoid')
823             api = api_base + '/broadcast/by_archive/%s.json' % video_id
824
825         self.report_extraction(video_id)
826
827         info = []
828         offset = 0
829         limit = self._JUSTIN_PAGE_LIMIT
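        # Channel archives are paged; keep fetching until the API returns a
        # short page (fewer than `limit` entries), which marks the last page.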
830         while True:
831             if paged:
832                 self.report_download_page(video_id, offset)
833             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
834             page_count, page_info = self._parse_page(page_url, video_id)
835             info.extend(page_info)
836             if not paged or page_count != limit:
837                 break
838             offset += limit
839         return info
840
841 class FunnyOrDieIE(InfoExtractor):
842     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
843
844     def _real_extract(self, url):
845         mobj = re.match(self._VALID_URL, url)
846         if mobj is None:
847             raise ExtractorError(u'invalid URL: %s' % url)
848
849         video_id = mobj.group('id')
850         webpage = self._download_webpage(url, video_id)
851
852         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
853             webpage, u'video URL', flags=re.DOTALL)
854
855         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
856             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
857
858         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
859             webpage, u'description', fatal=False, flags=re.DOTALL)
860
861         info = {
862             'id': video_id,
863             'url': video_url,
864             'ext': 'mp4',
865             'title': title,
866             'description': video_description,
867         }
868         return [info]
869
870 class SteamIE(InfoExtractor):
871     _VALID_URL = r"""http://store\.steampowered\.com/
872                 (agecheck/)?
873                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
874                 (?P<gameID>\d+)/?
875                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
876                 """
877     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
878     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
879
880     @classmethod
881     def suitable(cls, url):
882         """Receives a URL and returns True if suitable for this IE."""
883         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
884
885     def _real_extract(self, url):
886         m = re.match(self._VALID_URL, url, re.VERBOSE)
887         gameID = m.group('gameID')
888
889         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
890         webpage = self._download_webpage(videourl, gameID)
891
892         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
893             videourl = self._AGECHECK_TEMPLATE % gameID
894             self.report_age_confirmation()
895             webpage = self._download_webpage(videourl, gameID)
896
897         self.report_extraction(gameID)
898         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
899                                              webpage, 'game title')
900
901         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
902         mweb = re.finditer(urlRE, webpage)
903         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
904         titles = re.finditer(namesRE, webpage)
905         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
906         thumbs = re.finditer(thumbsRE, webpage)
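        # The zip() pairing assumes the page lists movie URLs, titles and
        # thumbnails in the same order; if Steam reorders its markup this will mismatch.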
907         videos = []
908         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
909             video_id = vid.group('videoID')
910             title = vtitle.group('videoName')
911             video_url = vid.group('videoURL')
912             video_thumb = thumb.group('thumbnail')
913             if not video_url:
914                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
915             info = {
916                 'id':video_id,
917                 'url':video_url,
918                 'ext': 'flv',
919                 'title': unescapeHTML(title),
920                 'thumbnail': video_thumb
921                   }
922             videos.append(info)
923         return [self.playlist_result(videos, gameID, game_title)]
924
925 class UstreamIE(InfoExtractor):
926     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
927     IE_NAME = u'ustream'
928
929     def _real_extract(self, url):
930         m = re.match(self._VALID_URL, url)
931         video_id = m.group('videoID')
932
933         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
934         webpage = self._download_webpage(url, video_id)
935
936         self.report_extraction(video_id)
937
938         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
939             webpage, u'title')
940
941         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
942             webpage, u'uploader', fatal=False, flags=re.DOTALL)
943
944         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
945             webpage, u'thumbnail', fatal=False)
946
947         info = {
948                 'id': video_id,
949                 'url': video_url,
950                 'ext': 'flv',
951                 'title': video_title,
952                 'uploader': uploader,
953                 'thumbnail': thumbnail,
954                }
955         return info
956
957 class WorldStarHipHopIE(InfoExtractor):
958     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
959     IE_NAME = u'WorldStarHipHop'
960
961     def _real_extract(self, url):
962         m = re.match(self._VALID_URL, url)
963         video_id = m.group('id')
964
965         webpage_src = self._download_webpage(url, video_id)
966
967         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
968             webpage_src, u'video URL')
969
970         if 'mp4' in video_url:
971             ext = 'mp4'
972         else:
973             ext = 'flv'
974
975         video_title = self._html_search_regex(r"<title>(.*)</title>",
976             webpage_src, u'title')
977
978         # Grab the thumbnail; if it is missing, this is a WSHH candy video and the title has to be taken from the candy markup instead.
979         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
980             webpage_src, u'thumbnail', fatal=False)
981
982         if not thumbnail:
983             _title = r"""candytitles.*>(.*)</span>"""
984             mobj = re.search(_title, webpage_src)
985             if mobj is not None:
986                 video_title = mobj.group(1)
987
988         results = [{
989                     'id': video_id,
990                     'url' : video_url,
991                     'title' : video_title,
992                     'thumbnail' : thumbnail,
993                     'ext' : ext,
994                     }]
995         return results
996
997 class RBMARadioIE(InfoExtractor):
998     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
999
1000     def _real_extract(self, url):
1001         m = re.match(self._VALID_URL, url)
1002         video_id = m.group('videoID')
1003
1004         webpage = self._download_webpage(url, video_id)
1005
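        # The show metadata is embedded in the page as a JavaScript `window.gon`
        # object; grab the object literal assigned to gon.show and parse it.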
1006         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1007             webpage, u'json data', flags=re.MULTILINE)
1008
1009         try:
1010             data = json.loads(json_data)
1011         except ValueError as e:
1012             raise ExtractorError(u'Invalid JSON: ' + str(e))
1013
1014         video_url = data['akamai_url'] + '&cbr=256'
1015         url_parts = compat_urllib_parse_urlparse(video_url)
1016         video_ext = url_parts.path.rpartition('.')[2]
1017         info = {
1018                 'id': video_id,
1019                 'url': video_url,
1020                 'ext': video_ext,
1021                 'title': data['title'],
1022                 'description': data.get('teaser_text'),
1023                 'location': data.get('country_of_origin'),
1024                 'uploader': data.get('host', {}).get('name'),
1025                 'uploader_id': data.get('host', {}).get('slug'),
1026                 'thumbnail': data.get('image', {}).get('large_url_2x'),
1027                 'duration': data.get('duration'),
1028         }
1029         return [info]
1030
1031
1032 class YouPornIE(InfoExtractor):
1033     """Information extractor for youporn.com."""
1034     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
1035
1036     def _print_formats(self, formats):
1037         """Print all available formats"""
1038         print(u'Available formats:')
1039         print(u'ext\t\tformat')
1040         print(u'---------------------------------')
1041         for format in formats:
1042             print(u'%s\t\t%s'  % (format['ext'], format['format']))
1043
1044     def _specific(self, req_format, formats):
1045         for x in formats:
1046             if(x["format"]==req_format):
1047                 return x
1048         return None
1049
1050     def _real_extract(self, url):
1051         mobj = re.match(self._VALID_URL, url)
1052         if mobj is None:
1053             raise ExtractorError(u'Invalid URL: %s' % url)
1054         video_id = mobj.group('videoid')
1055
1056         req = compat_urllib_request.Request(url)
1057         req.add_header('Cookie', 'age_verified=1')
1058         webpage = self._download_webpage(req, video_id)
1059
1060         # Get JSON parameters
1061         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1062         try:
1063             params = json.loads(json_params)
1064         except ValueError:
1065             raise ExtractorError(u'Invalid JSON')
1066
1067         self.report_extraction(video_id)
1068         try:
1069             video_title = params['title']
1070             upload_date = unified_strdate(params['release_date_f'])
1071             video_description = params['description']
1072             video_uploader = params['submitted_by']
1073             thumbnail = params['thumbnails'][0]['image']
1074         except KeyError as err:
1075             raise ExtractorError(u'Missing JSON parameter: ' + compat_str(err))
1076
1077         # Get all of the formats available
1078         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1079         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1080             webpage, u'download list').strip()
1081
1082         # Get all of the links from the page
1083         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1084         links = re.findall(LINK_RE, download_list_html)
1085         if not links:
1086             raise ExtractorError(u'ERROR: no known formats available for video')
1087
1088         self.to_screen(u'Links found: %d' % len(links))
1089
1090         formats = []
1091         for link in links:
1092
1093             # A link looks like this:
1094             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1095             # A path looks like this:
1096             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
1097             video_url = unescapeHTML( link )
1098             path = compat_urllib_parse_urlparse( video_url ).path
1099             extension = os.path.splitext( path )[1][1:]
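            # The 5th path component (e.g. '480p_370k_8004515') encodes the
            # resolution and bitrate; keep those two parts as the format label.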
1100             format = path.split('/')[4].split('_')[:2]
1101             size = format[0]
1102             bitrate = format[1]
1103             format = "-".join( format )
1104             # title = u'%s-%s-%s' % (video_title, size, bitrate)
1105
1106             formats.append({
1107                 'id': video_id,
1108                 'url': video_url,
1109                 'uploader': video_uploader,
1110                 'upload_date': upload_date,
1111                 'title': video_title,
1112                 'ext': extension,
1113                 'format': format,
1114                 'thumbnail': thumbnail,
1115                 'description': video_description
1116             })
1117
1118         if self._downloader.params.get('listformats', None):
1119             self._print_formats(formats)
1120             return
1121
1122         req_format = self._downloader.params.get('format', None)
1123         self.to_screen(u'Format: %s' % req_format)
1124
1125         if req_format is None or req_format == 'best':
1126             return [formats[0]]
1127         elif req_format == 'worst':
1128             return [formats[-1]]
1129         elif req_format in ('-1', 'all'):
1130             return formats
1131         else:
1132             format = self._specific(req_format, formats)
1133             if format is None:
1134                 raise ExtractorError(u'Requested format not available')
1135             return [format]
1136
1137
1138
1139 class PornotubeIE(InfoExtractor):
1140     """Information extractor for pornotube.com."""
1141     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1142
1143     def _real_extract(self, url):
1144         mobj = re.match(self._VALID_URL, url)
1145         if mobj is None:
1146             raise ExtractorError(u'Invalid URL: %s' % url)
1147
1148         video_id = mobj.group('videoid')
1149         video_title = mobj.group('title')
1150
1151         # Get webpage content
1152         webpage = self._download_webpage(url, video_id)
1153
1154         # Get the video URL
1155         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1156         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1157         video_url = compat_urllib_parse.unquote(video_url)
1158
1159         # Get the upload date
1160         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1161         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1162         if upload_date: upload_date = unified_strdate(upload_date)
1163
1164         info = {'id': video_id,
1165                 'url': video_url,
1166                 'uploader': None,
1167                 'upload_date': upload_date,
1168                 'title': video_title,
1169                 'ext': 'flv',
1170                 'format': 'flv'}
1171
1172         return [info]
1173
1174 class YouJizzIE(InfoExtractor):
1175     """Information extractor for youjizz.com."""
1176     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1177
1178     def _real_extract(self, url):
1179         mobj = re.match(self._VALID_URL, url)
1180         if mobj is None:
1181             raise ExtractorError(u'Invalid URL: %s' % url)
1182
1183         video_id = mobj.group('videoid')
1184
1185         # Get webpage content
1186         webpage = self._download_webpage(url, video_id)
1187
1188         # Get the video title
1189         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1190             webpage, u'title').strip()
1191
1192         # Get the embed page
1193         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1194         if result is None:
1195             raise ExtractorError(u'ERROR: unable to extract embed page')
1196
1197         embed_page_url = result.group(0).strip()
1198         video_id = result.group('videoid')
1199
1200         webpage = self._download_webpage(embed_page_url, video_id)
1201
1202         # Get the video URL
1203         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1204             webpage, u'video URL')
1205
1206         info = {'id': video_id,
1207                 'url': video_url,
1208                 'title': video_title,
1209                 'ext': 'flv',
1210                 'format': 'flv',
1211                 'player_url': embed_page_url}
1212
1213         return [info]
1214
1215 class EightTracksIE(InfoExtractor):
1216     IE_NAME = '8tracks'
1217     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1218
1219     def _real_extract(self, url):
1220         mobj = re.match(self._VALID_URL, url)
1221         if mobj is None:
1222             raise ExtractorError(u'Invalid URL: %s' % url)
1223         playlist_id = mobj.group('id')
1224
1225         webpage = self._download_webpage(url, playlist_id)
1226
1227         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1228         data = json.loads(json_like)
1229
1230         session = str(random.randint(0, 1000000000))
1231         mix_id = data['id']
1232         track_count = data['tracks_count']
1233         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1234         next_url = first_url
1235         res = []
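        # 8tracks only reveals one track per "play"/"next" API call, so walk the
        # mix track by track until the API reports at_last_track.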
1236         for i in itertools.count():
1237             api_json = self._download_webpage(next_url, playlist_id,
1238                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1239                 errnote=u'Failed to download song information')
1240             api_data = json.loads(api_json)
1241             track_data = api_data[u'set']['track']
1242             info = {
1243                 'id': track_data['id'],
1244                 'url': track_data['track_file_stream_url'],
1245                 'title': track_data['performer'] + u' - ' + track_data['name'],
1246                 'raw_title': track_data['name'],
1247                 'uploader_id': data['user']['login'],
1248                 'ext': 'm4a',
1249             }
1250             res.append(info)
1251             if api_data['set']['at_last_track']:
1252                 break
1253             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
1254         return res
1255
1256 class KeekIE(InfoExtractor):
1257     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
1258     IE_NAME = u'keek'
1259
1260     def _real_extract(self, url):
1261         m = re.match(self._VALID_URL, url)
1262         video_id = m.group('videoID')
1263
1264         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1265         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1266         webpage = self._download_webpage(url, video_id)
1267
1268         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1269             webpage, u'title')
1270
1271         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1272             webpage, u'uploader', fatal=False)
1273
1274         info = {
1275                 'id': video_id,
1276                 'url': video_url,
1277                 'ext': 'mp4',
1278                 'title': video_title,
1279                 'thumbnail': thumbnail,
1280                 'uploader': uploader
1281         }
1282         return [info]
1283
1284 class TEDIE(InfoExtractor):
1285     _VALID_URL=r'''http://www\.ted\.com/
1286                    (
1287                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1288                         |
1289                         ((?P<type_talk>talks)) # We have a simple talk
1290                    )
1291                    (/lang/(.*?))? # The url may contain the language
1292                    /(?P<name>\w+) # Here goes the name and then ".html"
1293                    '''
1294
1295     @classmethod
1296     def suitable(cls, url):
1297         """Receives a URL and returns True if suitable for this IE."""
1298         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1299
1300     def _real_extract(self, url):
1301         m = re.match(self._VALID_URL, url, re.VERBOSE)
1302         if m.group('type_talk'):
1303             return [self._talk_info(url)]
1304         else:
1305             playlist_id = m.group('playlist_id')
1306             name = m.group('name')
1307             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
1308             return [self._playlist_videos_info(url, name, playlist_id)]
1309
1310     def _playlist_videos_info(self,url,name,playlist_id=0):
1311         '''Returns the videos of the playlist'''
1312         video_RE=r'''
1313                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1314                      ([.\s]*?)data-playlist_item_id="(\d+)"
1315                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1316                      '''
1317         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1318         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
1319         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
1320         m_names=re.finditer(video_name_RE,webpage)
1321
1322         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1323                                                  webpage, 'playlist title')
1324
1325         playlist_entries = []
1326         for m_video, m_name in zip(m_videos,m_names):
1327             video_id=m_video.group('video_id')
1328             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1329             playlist_entries.append(self.url_result(talk_url, 'TED'))
1330         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1331
1332     def _talk_info(self, url, video_id=0):
1333         """Return the video for the talk in the url"""
1334         m = re.match(self._VALID_URL, url,re.VERBOSE)
1335         video_name = m.group('name')
1336         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1337         self.report_extraction(video_name)
1338         # If the url includes the language we get the title translated
1339         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1340                                         webpage, 'title')
1341         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1342                                     webpage, 'json data')
1343         talk_info = json.loads(json_data)
1344         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1345                                        webpage, 'description', flags = re.DOTALL)
1346         
1347         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1348                                        webpage, 'thumbnail')
1349         info = {
1350                 'id': talk_info['id'],
1351                 'url': talk_info['htmlStreams'][-1]['file'],
1352                 'ext': 'mp4',
1353                 'title': title,
1354                 'thumbnail': thumbnail,
1355                 'description': desc,
1356                 }
1357         return info
1358
1359 class MySpassIE(InfoExtractor):
1360     _VALID_URL = r'http://www\.myspass\.de/.*'
1361
1362     def _real_extract(self, url):
1363         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1364
1365         # video id is the last path element of the URL
1366         # usually there is a trailing slash, so also try the second but last
1367         url_path = compat_urllib_parse_urlparse(url).path
1368         url_parent_path, video_id = os.path.split(url_path)
1369         if not video_id:
1370             _, video_id = os.path.split(url_parent_path)
1371
1372         # get metadata
1373         metadata_url = META_DATA_URL_TEMPLATE % video_id
1374         metadata_text = self._download_webpage(metadata_url, video_id)
1375         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1376
1377         # extract values from metadata
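        # The metadata document is assumed to look roughly like the sketch
        # below (element names taken from the find() calls that follow; the
        # surrounding structure is a guess):
        #   <video>
        #     <url_flv>http://.../clip.flv</url_flv>
        #     <title>...</title>
        #     <format_id>...</format_id>
        #     <description>...</description>
        #     <imagePreview>http://.../preview.jpg</imagePreview>
        #   </video>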
1378         url_flv_el = metadata.find('url_flv')
1379         if url_flv_el is None:
1380             raise ExtractorError(u'Unable to extract download url')
1381         video_url = url_flv_el.text
1382         extension = os.path.splitext(video_url)[1][1:]
1383         title_el = metadata.find('title')
1384         if title_el is None:
1385             raise ExtractorError(u'Unable to extract title')
1386         title = title_el.text
1387         format_id_el = metadata.find('format_id')
1388         if format_id_el is None:
1389             format = extension
1390         else:
1391             format = format_id_el.text
1392         description_el = metadata.find('description')
1393         if description_el is not None:
1394             description = description_el.text
1395         else:
1396             description = None
1397         imagePreview_el = metadata.find('imagePreview')
1398         if imagePreview_el is not None:
1399             thumbnail = imagePreview_el.text
1400         else:
1401             thumbnail = None
1402         info = {
1403             'id': video_id,
1404             'url': video_url,
1405             'title': title,
1406             'ext': extension,
1407             'format': format,
1408             'thumbnail': thumbnail,
1409             'description': description
1410         }
1411         return [info]
1412
1413 class SpiegelIE(InfoExtractor):
1414     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1415
1416     def _real_extract(self, url):
1417         m = re.match(self._VALID_URL, url)
1418         video_id = m.group('videoID')
1419
1420         webpage = self._download_webpage(url, video_id)
1421
1422         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1423             webpage, u'title')
1424
1425         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1426         xml_code = self._download_webpage(xml_url, video_id,
1427                     note=u'Downloading XML', errnote=u'Failed to download XML')
1428
1429         idoc = xml.etree.ElementTree.fromstring(xml_code)
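        # The document carries one child element per available encoding; the
        # last one is taken here on the assumption that it is the best quality.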
1430         last_type = idoc[-1]
1431         filename = last_type.findall('./filename')[0].text
1432         duration = float(last_type.findall('./duration')[0].text)
1433
1434         video_url = 'http://video2.spiegel.de/flash/' + filename
1435         video_ext = filename.rpartition('.')[2]
1436         info = {
1437             'id': video_id,
1438             'url': video_url,
1439             'ext': video_ext,
1440             'title': video_title,
1441             'duration': duration,
1442         }
1443         return [info]
1444
1445 class LiveLeakIE(InfoExtractor):
1446
1447     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1448     IE_NAME = u'liveleak'
1449
1450     def _real_extract(self, url):
1451         mobj = re.match(self._VALID_URL, url)
1452         if mobj is None:
1453             raise ExtractorError(u'Invalid URL: %s' % url)
1454
1455         video_id = mobj.group('video_id')
1456
1457         webpage = self._download_webpage(url, video_id)
1458
1459         video_url = self._search_regex(r'file: "(.*?)",',
1460             webpage, u'video URL')
1461
1462         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1463             webpage, u'title').replace('LiveLeak.com -', '').strip()
1464
1465         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1466             webpage, u'description', fatal=False)
1467
1468         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1469             webpage, u'uploader', fatal=False)
1470
1471         info = {
1472             'id':  video_id,
1473             'url': video_url,
1474             'ext': 'mp4',
1475             'title': video_title,
1476             'description': video_description,
1477             'uploader': video_uploader
1478         }
1479
1480         return [info]
1481
1482
1483
1484 class TumblrIE(InfoExtractor):
1485     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1486
1487     def _real_extract(self, url):
1488         m_url = re.match(self._VALID_URL, url)
1489         video_id = m_url.group('id')
1490         blog = m_url.group('blog_name')
1491
1492         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1493         webpage = self._download_webpage(url, video_id)
1494
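        # The player markup is embedded inside a JavaScript string, so the
        # quotes around attribute values appear as the escape sequence \x22;
        # the regex below therefore matches \x22 instead of literal quotes.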
1495         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1496         video = re.search(re_video, webpage)
1497         if video is None:
1498             raise ExtractorError(u'Unable to extract video')
1499         video_url = video.group('video_url')
1500         ext = video.group('ext')
1501
1502         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1503             webpage, u'thumbnail', fatal=False)  # We pick the first poster
1504         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1505
1506         # The <title> tag is the only place a title can be found; it is not
1507         # complete, but searching elsewhere does not work for all videos
1508         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1509             webpage, u'title', flags=re.DOTALL)
1510
1511         return [{'id': video_id,
1512                  'url': video_url,
1513                  'title': video_title,
1514                  'thumbnail': video_thumbnail,
1515                  'ext': ext
1516                  }]
1517
1518 class BandcampIE(InfoExtractor):
1519     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1520
1521     def _real_extract(self, url):
1522         mobj = re.match(self._VALID_URL, url)
1523         title = mobj.group('title')
1524         webpage = self._download_webpage(url, title)
1525         # We get the link to the free download page
1526         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1527         if m_download is None:
1528             raise ExtractorError(u'No free songs found')
1529
1530         download_link = m_download.group(1)
1531         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
1532                        webpage, re.MULTILINE|re.DOTALL).group('id')
1533
1534         download_webpage = self._download_webpage(download_link, id,
1535                                                   'Downloading free downloads page')
1536         # We get the dictionary of the track from some javascript code
1537         info = re.search(r'items: (.*?),$',
1538                          download_webpage, re.MULTILINE).group(1)
1539         info = json.loads(info)[0]
1540         # We pick mp3-320 for now, until format selection can be easily implemented.
1541         mp3_info = info[u'downloads'][u'mp3-320']
1542         # If we try to use this url it says the link has expired
1543         initial_url = mp3_info[u'url']
1544         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1545         m_url = re.match(re_url, initial_url)
1546         # We build the url we will use to get the final track url
1547         # This url is built by Bandcamp in the script download_bunde_*.js
1548         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1549         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1550         # If we could correctly generate the .rand field the url would be
1551         # in the "download_url" key
1552         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1553
1554         track_info = {'id':id,
1555                       'title' : info[u'title'],
1556                       'ext' :   'mp3',
1557                       'url' :   final_url,
1558                       'thumbnail' : info[u'thumb_url'],
1559                       'uploader' :  info[u'artist']
1560                       }
1561
1562         return [track_info]
1563
1564 class RedTubeIE(InfoExtractor):
1565     """Information Extractor for redtube"""
1566     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1567
1568     def _real_extract(self,url):
1569         mobj = re.match(self._VALID_URL, url)
1570         if mobj is None:
1571             raise ExtractorError(u'Invalid URL: %s' % url)
1572
1573         video_id = mobj.group('id')
1574         video_extension = 'mp4'        
1575         webpage = self._download_webpage(url, video_id)
1576
1577         self.report_extraction(video_id)
1578
1579         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1580             webpage, u'video URL')
1581
1582         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1583             webpage, u'title')
1584
1585         return [{
1586             'id':       video_id,
1587             'url':      video_url,
1588             'ext':      video_extension,
1589             'title':    video_title,
1590         }]
1591         
1592 class InaIE(InfoExtractor):
1593     """Information Extractor for Ina.fr"""
1594     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1595
1596     def _real_extract(self,url):
1597         mobj = re.match(self._VALID_URL, url)
1598
1599         video_id = mobj.group('id')
1600         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1601         video_extension = 'mp4'
1602         webpage = self._download_webpage(mrss_url, video_id)
1603
1604         self.report_extraction(video_id)
1605
1606         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1607             webpage, u'video URL')
1608
1609         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1610             webpage, u'title')
1611
1612         return [{
1613             'id':       video_id,
1614             'url':      video_url,
1615             'ext':      video_extension,
1616             'title':    video_title,
1617         }]
1618
1619 class HowcastIE(InfoExtractor):
1620     """Information Extractor for Howcast.com"""
1621     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1622
1623     def _real_extract(self, url):
1624         mobj = re.match(self._VALID_URL, url)
1625
1626         video_id = mobj.group('id')
1627         webpage_url = 'http://www.howcast.com/videos/' + video_id
1628         webpage = self._download_webpage(webpage_url, video_id)
1629
1630         self.report_extraction(video_id)
1631
1632         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1633             webpage, u'video URL')
1634
1635         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1636             webpage, u'title')
1637
1638         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1639             webpage, u'description', fatal=False)
1640
1641         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1642             webpage, u'thumbnail', fatal=False)
1643
1644         return [{
1645             'id':       video_id,
1646             'url':      video_url,
1647             'ext':      'mp4',
1648             'title':    video_title,
1649             'description': video_description,
1650             'thumbnail': thumbnail,
1651         }]
1652
1653 class VineIE(InfoExtractor):
1654     """Information Extractor for Vine.co"""
1655     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1656
1657     def _real_extract(self, url):
1658         mobj = re.match(self._VALID_URL, url)
1659
1660         video_id = mobj.group('id')
1661         webpage_url = 'https://vine.co/v/' + video_id
1662         webpage = self._download_webpage(webpage_url, video_id)
1663
1664         self.report_extraction(video_id)
1665
1666         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1667             webpage, u'video URL')
1668
1669         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1670             webpage, u'title')
1671
1672         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1673             webpage, u'thumbnail', fatal=False)
1674
1675         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1676             webpage, u'uploader', fatal=False, flags=re.DOTALL)
1677
1678         return [{
1679             'id':        video_id,
1680             'url':       video_url,
1681             'ext':       'mp4',
1682             'title':     video_title,
1683             'thumbnail': thumbnail,
1684             'uploader':  uploader,
1685         }]
1686
1687 class FlickrIE(InfoExtractor):
1688     """Information Extractor for Flickr videos"""
1689     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1690
1691     def _real_extract(self, url):
1692         mobj = re.match(self._VALID_URL, url)
1693
1694         video_id = mobj.group('id')
1695         video_uploader_id = mobj.group('uploader_id')
1696         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1697         webpage = self._download_webpage(webpage_url, video_id)
1698
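        # Getting the video URL is a three-step handshake: the photo page
        # yields a secret, the first XML document maps it to a node_id, and the
        # playlist XML finally carries the STREAM APP/FULLPATH pieces that are
        # joined into the final URL below.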
1699         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1700
1701         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1702         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1703
1704         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1705             first_xml, u'node_id')
1706
1707         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1708         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1709
1710         self.report_extraction(video_id)
1711
1712         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1713         if mobj is None:
1714             raise ExtractorError(u'Unable to extract video url')
1715         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1716
1717         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1718             webpage, u'video title')
1719
1720         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1721             webpage, u'description', fatal=False)
1722
1723         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1724             webpage, u'thumbnail', fatal=False)
1725
1726         return [{
1727             'id':          video_id,
1728             'url':         video_url,
1729             'ext':         'mp4',
1730             'title':       video_title,
1731             'description': video_description,
1732             'thumbnail':   thumbnail,
1733             'uploader_id': video_uploader_id,
1734         }]
1735
1736 class TeamcocoIE(InfoExtractor):
1737     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1738
1739     def _real_extract(self, url):
1740         mobj = re.match(self._VALID_URL, url)
1741         if mobj is None:
1742             raise ExtractorError(u'Invalid URL: %s' % url)
1743         url_title = mobj.group('url_title')
1744         webpage = self._download_webpage(url, url_title)
1745
1746         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1747             webpage, u'video id')
1748
1749         self.report_extraction(video_id)
1750
1751         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1752             webpage, u'title')
1753
1754         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1755             webpage, u'thumbnail', fatal=False)
1756
1757         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1758             webpage, u'description', fatal=False)
1759
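        # The page itself only yields metadata; the media URL comes from a
        # separate XML document at /cvp/2.0/<video_id>.xml, from which the
        # "high" quality <file> entry is taken.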
1760         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1761         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1762
1763         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1764             data, u'video URL')
1765
1766         return [{
1767             'id':          video_id,
1768             'url':         video_url,
1769             'ext':         'mp4',
1770             'title':       video_title,
1771             'thumbnail':   thumbnail,
1772             'description': video_description,
1773         }]
1774
1775 class XHamsterIE(InfoExtractor):
1776     """Information Extractor for xHamster"""
1777     _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1778
1779     def _real_extract(self,url):
1780         mobj = re.match(self._VALID_URL, url)
1781
1782         video_id = mobj.group('id')
1783         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1784         webpage = self._download_webpage(mrss_url, video_id)
1785
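        # The media location is published as flashvars-style JavaScript: when
        # 'srv' is empty, 'file' already holds a complete (URL-encoded) URL;
        # otherwise the final URL is built as '<server>/key=<file>'.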
1786         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1787         if mobj is None:
1788             raise ExtractorError(u'Unable to extract media URL')
1789         if len(mobj.group('server')) == 0:
1790             video_url = compat_urllib_parse.unquote(mobj.group('file'))
1791         else:
1792             video_url = mobj.group('server')+'/key='+mobj.group('file')
1793         video_extension = video_url.split('.')[-1]
1794
1795         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1796             webpage, u'title')
1797
1798         # Can't see the description anywhere in the UI
1799         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1800         #     webpage, u'description', fatal=False)
1801         # if video_description: video_description = unescapeHTML(video_description)
1802
1803         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1804         if mobj:
1805             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1806         else:
1807             video_upload_date = None
1808             self._downloader.report_warning(u'Unable to extract upload date')
1809
1810         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1811             webpage, u'uploader id', default=u'anonymous')
1812
1813         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1814             webpage, u'thumbnail', fatal=False)
1815
1816         return [{
1817             'id':       video_id,
1818             'url':      video_url,
1819             'ext':      video_extension,
1820             'title':    video_title,
1821             # 'description': video_description,
1822             'upload_date': video_upload_date,
1823             'uploader_id': video_uploader_id,
1824             'thumbnail': video_thumbnail
1825         }]
1826
1827 class HypemIE(InfoExtractor):
1828     """Information Extractor for hypem"""
1829     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1830
1831     def _real_extract(self, url):
1832         mobj = re.match(self._VALID_URL, url)
1833         if mobj is None:
1834             raise ExtractorError(u'Invalid URL: %s' % url)
1835         track_id = mobj.group(1)
1836
1837         data = { 'ax': 1, 'ts': time.time() }
1838         data_encoded = compat_urllib_parse.urlencode(data)
1839         complete_url = url + "?" + data_encoded
1840         request = compat_urllib_request.Request(complete_url)
1841         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage')
1842         cookie = urlh.headers.get('Set-Cookie', '')
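        # The Set-Cookie value captured here is forwarded verbatim on the
        # /serve/source request further down; the final URL apparently is not
        # handed out without it.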
1843
1844         self.report_extraction(track_id)
1845
1846         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1847             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1848         try:
1849             track_list = json.loads(html_tracks)
1850             track = track_list[u'tracks'][0]
1851         except ValueError:
1852             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1853
1854         key = track[u"key"]
1855         track_id = track[u"id"]
1856         artist = track[u"artist"]
1857         title = track[u"song"]
1858
1859         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1860         request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
1861         request.add_header('cookie', cookie)
1862         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1863         try:
1864             song_data = json.loads(song_data_json)
1865         except ValueError:
1866             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1867         final_url = song_data[u"url"]
1868
1869         return [{
1870             'id':       track_id,
1871             'url':      final_url,
1872             'ext':      "mp3",
1873             'title':    title,
1874             'artist':   artist,
1875         }]
1876
1877 class Vbox7IE(InfoExtractor):
1878     """Information Extractor for Vbox7"""
1879     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1880
1881     def _real_extract(self,url):
1882         mobj = re.match(self._VALID_URL, url)
1883         if mobj is None:
1884             raise ExtractorError(u'Invalid URL: %s' % url)
1885         video_id = mobj.group(1)
1886
1887         redirect_page, urlh = self._download_webpage_handle(url, video_id)
1888         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1889         redirect_url = urlh.geturl() + new_location
1890         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1891
1892         title = self._html_search_regex(r'<title>(.*)</title>',
1893             webpage, u'title').split('/')[0].strip()
1894
1895         ext = "flv"
1896         info_url = "http://vbox7.com/play/magare.do"
1897         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1898         info_request = compat_urllib_request.Request(info_url, data)
1899         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1900         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1901         if info_response is None:
1902             raise ExtractorError(u'Unable to extract the media url')
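        # The response is a plain 'key=value&key=value' string (media URL
        # first, thumbnail second), so it is split by hand rather than parsed
        # as a query string.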
1903         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1904
1905         return [{
1906             'id':        video_id,
1907             'url':       final_url,
1908             'ext':       ext,
1909             'title':     title,
1910             'thumbnail': thumbnail_url,
1911         }]
1912
1913
1914 def gen_extractors():
1915     """Return a list containing an instance of every supported extractor.
1916     The order matters: the first extractor that matches the URL is the one that handles it.
1917     """
1918     return [
1919         YoutubePlaylistIE(),
1920         YoutubeChannelIE(),
1921         YoutubeUserIE(),
1922         YoutubeSearchIE(),
1923         YoutubeIE(),
1924         MetacafeIE(),
1925         DailymotionIE(),
1926         GoogleSearchIE(),
1927         PhotobucketIE(),
1928         YahooIE(),
1929         YahooSearchIE(),
1930         DepositFilesIE(),
1931         FacebookIE(),
1932         BlipTVIE(),
1933         BlipTVUserIE(),
1934         VimeoIE(),
1935         MyVideoIE(),
1936         ComedyCentralIE(),
1937         EscapistIE(),
1938         CollegeHumorIE(),
1939         XVideosIE(),
1940         SoundcloudSetIE(),
1941         SoundcloudIE(),
1942         InfoQIE(),
1943         MixcloudIE(),
1944         StanfordOpenClassroomIE(),
1945         MTVIE(),
1946         YoukuIE(),
1947         XNXXIE(),
1948         YouJizzIE(),
1949         PornotubeIE(),
1950         YouPornIE(),
1951         GooglePlusIE(),
1952         ArteTvIE(),
1953         NBAIE(),
1954         WorldStarHipHopIE(),
1955         JustinTVIE(),
1956         FunnyOrDieIE(),
1957         SteamIE(),
1958         UstreamIE(),
1959         RBMARadioIE(),
1960         EightTracksIE(),
1961         KeekIE(),
1962         TEDIE(),
1963         MySpassIE(),
1964         SpiegelIE(),
1965         LiveLeakIE(),
1966         ARDIE(),
1967         ZDFIE(),
1968         TumblrIE(),
1969         BandcampIE(),
1970         RedTubeIE(),
1971         InaIE(),
1972         HowcastIE(),
1973         VineIE(),
1974         FlickrIE(),
1975         TeamcocoIE(),
1976         XHamsterIE(),
1977         HypemIE(),
1978         Vbox7IE(),
1979         GametrailersIE(),
1980         StatigramIE(),
1981         GenericIE()
1982     ]
1983
1984 def get_info_extractor(ie_name):
1985     """Returns the info extractor class with the given ie_name"""
1986     return globals()[ie_name+'IE']
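
# A minimal usage sketch (the URL and video id below are hypothetical):
#
#     ie_class = get_info_extractor('Youtube')   # -> YoutubeIE
#     ie = ie_class()
#     if ie.suitable('http://www.youtube.com/watch?v=XXXXXXXXXXX'):
#         ...
#
# gen_extractors() is meant to be iterated in order: the first extractor whose
# suitable() matches the URL is the one that handles it.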