Move blip.tv extractors into their own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.dailymotion import DailymotionIE
25 from .extractor.gametrailers import GametrailersIE
26 from .extractor.generic import GenericIE
27 from .extractor.metacafe import MetacafeIE
28 from .extractor.statigram import StatigramIE
29 from .extractor.photobucket import PhotobucketIE
30 from .extractor.vimeo import VimeoIE
31 from .extractor.yahoo import YahooIE, YahooSearchIE
32 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
33 from .extractor.zdf import ZDFIE
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    POSTs the 'Free download' form to the English-locale file page, then
    scrapes the real fileshare URL and the file title out of the response.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # _download_webpage (rather than a bare urlopen) gives us a decoded
        # str on both Python 2 and 3 and consistent error reporting; the old
        # file_id/file_url/.decode('utf-8') calls crashed on Python 3.
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        webpage = self._download_webpage(request, file_id,
                                         errnote=u'Unable to retrieve file webpage')

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace so the site's message fits on one line
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
97
98
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Best-effort login before extraction.

        Credentials come from the --username/--password options or, when
        --netrc is set, from the 'facebook' machine entry in ~/.netrc.
        Every failure path only emits a warning and returns, so extraction
        proceeds unauthenticated rather than aborting.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available at all: skip login silently
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login <form>, the credentials
            # were rejected (or we hit the login rate limit).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract a Facebook video.

        Returns a single-entry list with the stream URL (HD 'hd_src'
        preferred, SD 'sd_src' fallback), title, duration and thumbnail.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player data is a JSON array wedged between these two exact
        # JavaScript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the stream descriptors
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
193
194
195
196
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """Decrypt (RC4 is symmetric) data with the given key; returns str."""
        x = 0
        box = list(range(256))
        # Key-scheduling algorithm
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # Pseudo-random generation + XOR with the keystream
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        """Return the hex MD5 digest of s, encoded as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site secret used to derive the RC4 key
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page carries a plain <source> element
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # Hard case: fetch and decrypt the '_encxml' player configuration
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(b64decode(b64decode(GK)) + md5(video_id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            # HLS case: the m3u8 playlist URL is path + file with the
            # extension swapped.  Bug fix: video_filepath was referenced
            # without ever being assigned, raising NameError for every
            # f4m stream; extract it from the decrypted config here.
            video_filepath = self._search_regex(
                'path=\'(.*?)\'', dec_data, u'video path')
            video_playpath = ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
345
346
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrate ids (kbps) the feed may offer
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate id (shown by --list-formats)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame dimensions per bitrate id (shown by --list-formats)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE spacing,
        # which the default suitable() implementation does not apply.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print the given format ids with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return one info dict per act/part of the requested episode or clip."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Shortcut forms (:tds, :colbert, ...) map to the newest full episode
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode in the URL means "download the newest one"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The site redirects /full-episodes/ to the newest episode;
            # follow the redirect to learn the concrete episode URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mgid URIs of the player(s) embedded in the page
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in an element without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index feed lists one <item> per act/part of the episode
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part config XML enumerates the available renditions
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into the equivalent progressive-HTTP one
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
513
514
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    Reads the page's meta tags for description/thumbnail/player URL, then
    downloads the player's JS configuration to get the stream URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The meta title looks like "Show : Episode"; keep the episode part.
        # Bug fix: the regex's display name was copy-pasted as u'player url',
        # producing a misleading error message when the title is missing.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        # The real config URL is percent-encoded in the player URL's query
        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad; entry 1 is the actual video
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
574
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Downloads the moogaloop metadata XML, then the Adobe HDS (f4m) manifest
    it references, and builds the fragment URL from both documents.
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Metadata XML: title, description, thumbnail and the manifest URL.
        # Use _download_webpage (consistent with the other extractors here)
        # instead of a bare urlopen with hand-rolled exception handling,
        # which returned undecoded bytes on Python 3.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = self._download_webpage(xmlUrl, video_id, note=False,
                                         errnote=u'Unable to download video info XML')

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = self._download_webpage(manifest_url, video_id, note=False,
                                             errnote=u'Unable to download video info XML')

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        # Fragment URL: scheme://host/z<id-without-suffix>/<media url>Seg1-Frag1
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
636
637
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The stream URL is percent-encoded in the player's flashvars
        encoded_url = self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(encoded_url)

        # The page title carries the video title followed by the site suffix
        video_title = self._html_search_regex(
            r'<title>(.*?)\s+-\s+XVID', webpage, u'title')

        # Thumbnail filename is the last path component of the preview image
        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
678
679
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also in the url)
        # (removed the unused 'simple_title' local)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into track metadata via the public API
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # The streams endpoint lists the downloadable renditions
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
736
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        # (removed the unused 'simple_title' local)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into set metadata via the public API
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        # API-level errors are reported per-entry; best-effort, so just warn
        # the user and bail out without raising.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        # One info dict per track in the set
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
799
800
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real video id is base64-encoded and URL-quoted in the page.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the last path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
843
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # Disabled: the site switched to a new API (see link).
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a dict mapping bitrate -> url list, or a
        plain url list when no bitrate information is available.
        """
        # NOTE(review): file_url is assigned here but never used.
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            # Indexing a list with a bitrate key raises TypeError; the
            # format entry is then already the url list itself.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Opening the URL is enough; the body is not read.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        # None of the candidate URLs responded.
        return None

    def _print_formats(self, formats):
        # Print the format/bitrate/extension table for --list-formats.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): the .decode('utf-8') calls below assume Python 2
        # byte strings; on Python 3 they would raise AttributeError
        # (extractor is disabled via _WORKING, so this is latent).
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until one of its URLs is reachable.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
948
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Handle three URL shapes: a specific video, a course page, or
        the site root. Course/root pages are expanded recursively by
        re-dispatching each linked page through self.extract()."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and relative file path come from the per-video XML.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect every linked video page (deduplicated, order kept).
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each video page is handled by the branch above.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect every linked course page and recurse into each.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1044
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV.com video.

        Returns a single-element list with the usual info dictionary;
        raises ExtractorError on an invalid URL or metadata failure.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE(review): song_name is extracted but not currently used in
        # the returned info dict.
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # The mtv_an meta value doubles as the performing artist, which is
        # reported as the uploader. Previously `performer` was referenced in
        # the info dict without ever being assigned, so building the result
        # crashed with a NameError.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # NOTE(review): mtvn_uri and content_id use fatal=False and may be
        # None, in which case this concatenation raises TypeError — confirm
        # whether they should be fatal instead.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # rendition type looks like "video/<ext>"; build a format label
            # of the shape "<ext>-<width>x<height>_<bitrate>".
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1105
1106
class YoukuIE(InfoExtractor):
    # Information extractor for v.youku.com.
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: current time in milliseconds plus two random suffixes.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the fixed alphabet using `seed`.

        A linear-congruential style update of the seed picks (and removes)
        one character of the source alphabet per iteration; the result is
        the permuted alphabet as a list of characters.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the seed-shuffled
        # alphabet; decode it back into the real file id string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto Youku's stream names.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                # NOTE(review): 'worst' selects the mp4 stream — confirm
                # this is actually the lowest-quality one.
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (hex) into the decoded file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        # One info dict per segment of the video.
        return files_info
1199
1200
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded in the page source.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
1239
1240
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)
        video_extension = 'flv'

        # Step 1: the post page carries the date, uploader and title.
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
        self.report_extraction(video_id)

        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d").strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # The first line of the description doubles as the title.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2: simulate clicking the image box to reach the video page.
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Gather (resolution, url) pairs for every available size.
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sorting puts the highest resolution last; keep only its URL.
        video_url = sorted(mobj)[-1][-1]

        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
1314
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)

        # The CDN location is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
1348
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (total items in the response, items that actually have a
        video_file_url) — the total drives the pagination loop below.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip the dashes.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # A whole channel: archives are fetched page by page below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # A single chapter of a broadcast: resolved via its archive.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the archive entry this chapter belongs to (for-else:
            # the else fires only when no entry matched).
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # A single archived broadcast.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive list is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1481
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The video URL lives in the second <source> of the <video> element.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player-page heading; fall back to the document title.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
1510
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com trailer pages.

    Returns a single playlist result containing every movie found on the
    game's video page; an age gate, when present, is bypassed by requesting
    the agecheck URL with a fixed birth date.
    """
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode pattern and must be
        # matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Retry through the age-gate URL if the birth-date prompt is shown.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # Movie URLs, display titles and thumbnails are scraped separately and
        # zipped together below; this assumes all three appear in the same
        # order on the page (TODO confirm against a live page).
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
                  }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
1565
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The flv file is addressable on the CDN directly by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', webpage, u'title')

        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: this extractor returns a single info dict, not a list.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1597
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com videos."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Pick the container extension from the stream URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
1637
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JS assignment to gon.show.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append the bitrate parameter to request the 256 kbps stream.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1671
1672
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' matches req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site requires an age_verified cookie to serve the real page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:  # was a bare except: catch only JSON decode errors
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # str() is required: concatenating the exception object itself to a
            # string raised TypeError in the original code.
            raise ExtractorError(u'Missing JSON parameter: ' + str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The fourth path component encodes "<size>p_<bitrate>k_<id>";
            # the first two parts joined give the format label, e.g. "480p-370k".
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # was: `if result is None:` — `result` is undefined here (NameError)
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
1777
1778
1779
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title comes straight from the URL, not the page.
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded (percent-encoded) in the player config.
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Upload date is optional; normalize it when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
1814
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The watch page only carries the title and a link to the embed page.
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Locate the embed page, which holds the actual stream URL.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        # The embed page uses its own numeric id.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
1855
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    Walks the play API track by track and returns a list with one info
    dict per song in the mix.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS assignment to PAGE.mix.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a (random) session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            # The API flags the last track; otherwise ask for the next one,
            # passing the id of the track just fetched.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
1896
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail are both addressable on the CDN by video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
1924
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode pattern and must be
        # matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL shape: single talk vs. playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Talk entries and talk-page links are scraped separately and zipped
        # below; this assumes both appear in the same order on the page
        # (TODO confirm against a live page).
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE: 'info' is rebound here from the parsed JSON dict to the
        # result dict; the last htmlStreams entry supplies the stream URL.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
1999
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos.

    The video id is taken from the URL path; all metadata comes from the
    site's XML metadata service.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # was: `format = ext` — `ext` is undefined here (NameError);
            # fall back to the file extension instead
            format = extension
        else:
            format = format_id_el.text
        # description and preview image are optional in the metadata XML
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2053
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Stream variants are described in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last listed variant is the one we download.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2085
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a "LiveLeak.com -" prefix that is stripped off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }]
2122
2123
2124
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        matched = re.match(self._VALID_URL, url)
        video_id = matched.group('id')
        blog = matched.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail, dropping escape backslashes.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext,
                 }]
2158
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp track downloads.

    Only tracks exposing a free download page work; the final download URL
    is obtained by rebuilding a statdownload request from the pieces of the
    initially offered (already expired) URL.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # The track id is read from the TralbumData JS object on the page.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
2204
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 <source> tag carries the direct media URL
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2232         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against a non-matching URL, consistent with the other extractors
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The MRSS feed for the video exposes both the media URL and the title
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Dots escaped so the pattern only matches the literal mp4.ina.fr host
        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
2259
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Fetch a canonical page for the id rather than the user-supplied URL
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
2293
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Normalize to the canonical https page for this id
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
2327
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both XML endpoints below
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The playlist XML carries the stream application and path separately
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
2376
class TeamcocoIE(InfoExtractor):
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only available inside the page markup
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"', webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"', webpage, u'description', fatal=False)

        # A separate XML document holds the actual stream URL
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
                                      video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
2415
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped: previously it was a wildcard and the
    # pattern also matched hosts such as "wwwXxhamster.com".
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against a non-matching URL, consistent with the other extractors
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config embeds an optional server plus the file token
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server: the file field is a URL-encoded absolute URL
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is shown as a YYYY-MM-DD hh:mm:ss TZ tooltip hint
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
2467
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Hypem wants an 'ax'/'ts' query pair appended before serving the page
        complete_url = url + "?" + compat_urllib_parse.urlencode({ 'ax': 1, 'ts': time.time() })
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Fetch the serve endpoint with the session cookie to obtain metadata
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
2517
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page is a javascript redirect; follow it manually
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the info endpoint to obtain the media and thumbnail URLs
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do",
            compat_urllib_parse.urlencode({'as3':'1','vid':video_id}))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like "url=...&thumb=..."; keep only the values
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
2553
2554
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    ]
    # Instantiate in declaration order; earlier entries take precedence.
    return [klass() for klass in extractor_classes]
2624
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention at module level
    return globals()['%sIE' % ie_name]