ca1e43404122ff05c750830b6667e686a7fc65c8
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.gametrailers import GametrailersIE
27 from .extractor.generic import GenericIE
28 from .extractor.googleplus import GooglePlusIE
29 from .extractor.googlesearch import GoogleSearchIE
30 from .extractor.metacafe import MetacafeIE
31 from .extractor.myvideo import MyVideoIE
32 from .extractor.statigram import StatigramIE
33 from .extractor.photobucket import PhotobucketIE
34 from .extractor.vimeo import VimeoIE
35 from .extractor.yahoo import YahooIE, YahooSearchIE
36 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
37 from .extractor.zdf import ZDFIE
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        # The file id is the last path component of the URL.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # _download_webpage (instead of raw urlopen().read()) reports progress,
        # wraps network errors in ExtractorError and decodes the response to
        # text, which keeps the str-pattern regexes below working on Python 3
        # where urlopen() returns bytes.
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        webpage = self._download_webpage(request, file_id,
                                         note=u'Retrieving file webpage',
                                         errnote=u'Unable to retrieve file webpage')

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # The .decode('utf-8') calls were removed: these values are already
        # text, and str has no .decode() on Python 3.
        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
101
102
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in using credentials from the downloader options or .netrc.

        Login is best-effort: any failure emits a warning and returns, so
        extraction of public videos still proceeds.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            # .get() instead of indexing: a username supplied without a
            # password must not raise KeyError here.
            password = downloader_params.get('password', None)
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means we are still logged out.
            # NOTE(review): login_results is bytes on Python 3 while the
            # pattern is str — confirm on a Python 3 run.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # typo fixed: "exceded" -> "exceeded"
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flash params are wedged between these two markers as a JSON
        # array of [key, value] pairs.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
197
198
199
200
201
202
203
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUG FIX: the field description was u'player url' (copy-paste from
        # above), producing a misleading error message when the title regex
        # fails.  The page title looks like "Show : Episode"; keep the tail.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
263
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Two-step extraction: a metadata XML document yields title,
        description, thumbnail and the f4m manifest URL; the manifest then
        yields the fragment URL pieces the final f4f URL is assembled from."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Step 1: per-video metadata XML.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            # All of these child elements are required; a missing one makes
            # findall() return [] and raises IndexError below.
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore parameter identifies the HDS client version to the server.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # Step 2: Adobe HDS (f4m) manifest.
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the f4m 1.0 XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Rebuild the fragment URL on the manifest's host; video_id[:-2]
        # strips the manifest id's trailing two characters — presumably a
        # quality/variant suffix (TODO confirm against a live manifest).
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
325
326
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flash player is handed the media location through the
        # percent-encoded flv_url query parameter embedded in the page.
        encoded_url = self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL')
        media_url = compat_urllib_parse.unquote(encoded_url)

        # Page title carries the video name followed by a site suffix.
        title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID', webpage, u'title')

        thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }]
367
368
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The page URL is resolved through the public API to obtain the track
    metadata, and a second API call lists the stream URLs for the track.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The URL encodes both the uploader and the track slug.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title  # kept for parity; unused below
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into the track's API representation.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        track_info = json.loads(info_json)
        track_id = track_info['id']
        self.report_extraction(full_title)

        # Second call: list the stream URLs for this track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        stream_defs = json.loads(stream_json)

        return [{
            'id':       track_info['id'],
            'url':      stream_defs['http_mp3_128_url'],
            'uploader': track_info['user']['username'],
            'upload_date': unified_strdate(track_info['created_at']),
            'title':    track_info['title'],
            'ext':      u'mp3',
            'description': track_info['description'],
        }]
425
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    A set page is resolved through the public API into a track list; each
    track's stream URLs are then fetched individually.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and set slug are both part of the page URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title  # kept for parity; unused below
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set page into its API representation.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        set_info = json.loads(info_json)
        if 'errors' in set_info:
            for err in set_info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        results = []
        for track in set_info['tracks']:
            track_id = track['id']

            # Fetch the stream definitions for this track.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            stream_defs = json.loads(stream_json)

            results.append({
                'id':       track_id,
                'url':      stream_defs['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return results
488
489
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The media path is stored base64- and percent-encoded in a JS variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the last path component ("<id>.<ext>").
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
532
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Return the url list for *fmt* from the 'audio_formats' JSON section.

        When the format entry maps bitrates to url lists, pick *bitrate*
        (falling back to the highest available for None/'best'/unknown);
        when there is no bitrate level, return the plain url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if all are dead."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue # dead link, try the next candidate
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # (the old .decode('utf-8') calls were removed: regex groups are
        # already text and str has no .decode() on Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until one yields a live URL.  Initializing
            # these avoids a NameError when 'formats' is empty.
            file_url = None
            format_param = None
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # Every candidate URL was dead; previously this fell through and
            # crashed with an AttributeError on file_url below.
            raise ExtractorError(u'Unable to find a working media URL')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': format_param if format_param is not None else u'NA',
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
637
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL shape: a specific video (course+video query
        params), a course page (course only), or the site root.  Course and
        root pages expand recursively via self.extract() on each linked
        page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # <title> and <videoFile> are both required.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect the (deduplicated, order-preserving) video page links
            # and recurse into each one through the dispatch above.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Same recursion as for a course page, one level higher: every
            # course link on the root page is extracted in turn.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
733
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The song name doubles as the uploader/performer in the result.
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # NOTE(review): mtvn_uri/content_id use fatal=False, so either may be
        # None here and this concatenation would raise TypeError — confirm
        # whether they should be fatal instead.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            # BUG FIX: this previously referenced an undefined name
            # `performer`, raising NameError on every successful extraction.
            'uploader': song_name,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
794
795
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple flv/mp4 segments and obfuscates the
    file id; this extractor decodes the id and returns one info dict per
    segment.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a session id: millisecond timestamp plus two random blocks."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet deterministically shuffled by 'seed'.

        Uses a linear congruential step (x*211+30031 mod 65536) to pick and
        remove one character per round, mirroring the site's player code.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index list into the real file id.

        Each non-empty token is an index into the seed-shuffled alphabet.
        """
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, decode the file id, and emit per-segment info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the generic format request onto Youku's stream names;
            # anything unrecognized falls back to plain flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # fileid[8:10] encodes the segment number; it is replaced below with
        # the zero-padded hex index of each segment.
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
888
889
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Download the video page and scrape URL, title and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player parameters.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(
            self.VIDEO_TITLE_RE, webpage, u'title')

        video_thumbnail = self._search_regex(
            self.VIDEO_THUMB_RE, webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
928
929
930
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Derive the CDN mp4 URL from the page path and scrape metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The stream sits at a predictable location on Turner's CDN.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
964
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips into info dicts.

        Returns (total_items_on_page, list_of_valid_items); clips with an
        empty video_file_url are counted but skipped.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # The API signals errors by returning a JSON object instead of a list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD...'; drop the dashes to get
                # the YYYYMMDD upload_date form.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: whole channel (paged), chapter, or single video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel archives can span many pages; walk them below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: 'a' keeps the matching <archive> element after the
            # loop; the else branch only runs when no break happened.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API; a short page (count != limit) means we are done.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1097
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the mp4 URL, title and description from a FunnyOrDie page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # The title shows up either as the player heading or in <title>.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
1126
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist containing every trailer on a game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Steam gates some titles behind an age check; re-request the page
        # with a pre-filled birth date when that form is present.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        videos = []
        # Three regexes walk the page in lockstep: one match each per movie.
        for vid, vtitle, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
1181
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded Ustream video served directly from the CDN."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Recorded videos are addressable on the CDN by id alone.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', webpage, u'title')

        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: this extractor returns a single info dict, not a list,
        # matching the original behavior.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1213
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the flash player's file URL plus title and thumbnail."""
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # The container is inferred from the URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            mobj = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
1253
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Parse the embedded gon.show JSON blob for stream metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Force the 256 kbit/s stream variant.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1287
1288
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Extract every download format for a video and honour the requested
        format ('best', 'worst', '-1'/'all', or a specific size-bitrate name).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # Narrowed from a bare except: only malformed JSON is expected here.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # BUG FIX: sys.exc_info()[1] is an exception object; concatenating
            # it to a str raised TypeError. Convert explicitly.
            raise ExtractorError('Missing JSON parameter: ' + compat_str(sys.exc_info()[1]))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 is e.g. '480p_370k_8004515': size, bitrate, id.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: the original tested an undefined name 'result' here,
            # raising NameError whenever the requested format was missing.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
1394
1395
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the flv URL and upload date from the player page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The player configuration carries a percent-encoded flv URL.
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # The page shows e.g. "Added 12/31/2010 by".
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
1430
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page and extract the real flv URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', webpage, u'title').strip()

        # The playable stream lives on a separate embed page.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
1471
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play/next API until the mix reports its last track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id gives us a fresh server-side play sequence.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        next_url = first_url
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
1512
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        # Media and thumbnail URLs are derived directly from the video id;
        # the page itself is only scraped for title and uploader.
        video_id = re.match(self._VALID_URL, url).group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
1540
class TEDIE(InfoExtractor):
    """Information extractor for ted.com single talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with verbose-regex syntax, so the match
        # must be done with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on the URL type captured by the pattern:
        # a single talk or a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Talk entries and their titles/URLs appear in two separate markup
        # structures; each is matched independently and zipped together below.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        playlist_entries = []
        # Each talk URL is handed back to this same extractor via url_result.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # `info` (the parsed talkDetails JSON) is rebound here to the result
        # dict; the last entry of htmlStreams provides the media URL.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
1615
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (reads the site's metadata XML)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory,
        # everything else degrades gracefully to a fallback or None
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this branch previously read the undefined name `ext`,
            # raising NameError whenever <format_id> was missing. Fall back
            # to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
1669
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        # Format data lives in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child element of the document is the variant we use.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
1701
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a site prefix that is stripped off here.
        raw_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        video_title = raw_title.replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        video_uploader = self._html_search_regex(
            r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }]
1738
1739
1740
class TumblrIE(InfoExtractor):
    """Information extractor for Tumblr video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_match = re.search(re_video, webpage)
        if video_match is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video_match.group('video_url')
        ext = video_match.group('ext')

        # We pick the first poster as the thumbnail.
        video_thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
1774
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks (picks the mp3-320 download)."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id` to avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
1820
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # A direct MP4 <source> element is present in the page markup.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
1848         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Metadata, including the MP4 URL, is served from an MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
1875
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Always fetch the canonical page for this id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
1909
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Fetch the canonical HTTPS page for the id.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
1943
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo secret is required by both playlist requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream_match = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream_match is None:
            raise ExtractorError(u'Unable to extract video url')
        # Final URL = app prefix + unescaped full path.
        video_url = stream_match.group(1) + unescapeHTML(stream_match.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
1992
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only available inside the page markup.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The media URL itself comes from a per-video XML document.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
                                      video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
2031
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        if media.group('server'):
            video_url = media.group('server')+'/key='+media.group('file')
        else:
            # Empty server field: 'file' holds a complete, URL-quoted address.
            video_url = compat_urllib_parse.unquote(media.group('file'))
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        date_match = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if date_match is None:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')
        else:
            # Collapse YYYY-MM-DD into YYYYMMDD.
            video_upload_date = ''.join(date_match.group('upload_date_Y', 'upload_date_m', 'upload_date_d'))

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
2083
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Append ax/ts query parameters to the track page request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        # The Set-Cookie header of this response is needed to authorize the
        # /serve/source request further below.
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as JSON inside a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]  # only the first track is used
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # Note: track_id is rebound here from the URL slug to the JSON id.
        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The empty data argument turns this into a POST request.
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
2133
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects through a javascript location assignment.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # Ask the player endpoint for the final media and thumbnail URLs.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response shape: "<key>=<final_url>&<key>=<thumbnail_url>".
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       'flv',
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
2169
2170
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        # Youtube playlist/channel/user/search extractors precede YoutubeIE
        # so their URLs are not swallowed by the plain video extractor.
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        BlipTVUserIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        ZDFIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        HowcastIE(),
        VineIE(),
        FlickrIE(),
        TeamcocoIE(),
        XHamsterIE(),
        HypemIE(),
        Vbox7IE(),
        GametrailersIE(),
        StatigramIE(),
        # GenericIE stays last so every specific extractor above gets a
        # chance to claim the URL first.
        GenericIE()
    ]
2240
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention and live at
    # module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]