Move YoutubeSearchIE to the other youtube IEs
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
25
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.metacafe import MetacafeIE
30 from .extractor.statigram import StatigramIE
31 from .extractor.photobucket import PhotobucketIE
32 from .extractor.vimeo import VimeoIE
33 from .extractor.yahoo import YahooIE
34 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
35 from .extractor.zdf import ZDFIE
36
37
38
39
40
41
42
43
44
45
46
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Matches every URL (``_VALID_URL`` is ``.*``) and is tried after the
    specialized extractors.  It first follows URL-shortener style
    redirects, then scans the downloaded page for common embedded-player
    patterns (JW Player flashvars, bare file/source parameters, the
    JWPlayer JS loader, Twitter player cards, Open Graph video metadata).
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that we fell back to the generic extractor, but stay quiet
        # in test mode to keep test output clean.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Returns the final URL when *url* redirects somewhere else, or
        False when the server answers without redirecting.  Only HEAD
        requests are issued, so no payload is downloaded.
        """
        class HeadRequest(compat_urllib_request.Request):
            # A Request whose HTTP method is HEAD instead of GET.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers make no sense on a HEAD request.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same URL back means there was no redirect to follow.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        # If the URL is just a redirect (shorteners etc.), hand the
        # target back so the proper extractor can be selected for it.
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
186
187
188
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Returns a playlist dict whose 'entries' are url results, at most
        *n* of them.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # Page numbering starts at 0: the previous code counted from 1,
        # so the first request used start=10 and silently skipped the
        # first ten results of every search.
        for pagenum in itertools.count(0):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum + 1))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)
                # Stop exactly at n results; the old termination test
                # (pagenum * 10 > n) could return up to 9 extra entries.
                if len(res['entries']) >= n:
                    return res

            # No "next page" link means the results are exhausted.
            if not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
219
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Queries the Yahoo screen JSON search endpoint page by page and
        returns a playlist dict with at most *n* url-result entries.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # An empty result page previously left the loop variable `i`
            # unbound and crashed with NameError below; bail out instead.
            if not results:
                break

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                # Skip result snippets that do not contain a video link
                # (the old code raised AttributeError on mobj == None).
                if mobj is None:
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop once n results were seen or the server says this was
            # the last page (m['last'] / m['total'] are its pagination).
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
253
254
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist of all videos uploaded by a blip.tv user.

        Pages through the mobile Ajax episode-list endpoint until a
        partial page signals that all ids were collected.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # Fail with a clear message instead of an AttributeError when the
        # page layout changes and the user id cannot be found.
        if mobj is None:
            raise ExtractorError(u'Unable to extract user id from %s' % url)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Deduplicate on the unescaped id: the old code compared
                # the raw match but stored the unescaped value, letting
                # HTML-escaped duplicates through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title=username)]
313
314
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a file.

        Simulates pressing the 'Free download' button (POSTing
        gateway_result=1) and parses the resulting page.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode to text right away: on Python 3 urlopen().read()
            # returns bytes and the str regexes below would raise
            # TypeError against a bytes haystack.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # The old .decode('utf-8') calls crashed on Python 3, where these
        # values are already text (str has no decode method).
        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
359
360
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Credentials come from the username/password downloader params or,
        with usenetrc, from the 'facebook' entry in ~/.netrc.  Login
        failures only emit warnings and return; extraction is still
        attempted afterwards.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means we are still on the
            # login page, i.e. authentication did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video info from a Facebook video/photo URL.

        Parses the JSON blob that the page hands to its Flash player
        (between the BEFORE/AFTER markers) for the stream URL, preferring
        the HD source over SD.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are inlined between these two markers.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON nested inside the outer JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
455
456
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        Handles three URL shapes: api.swf links (rewritten to /play/
        URLs), /play/ embeds (resolved via their redirect and then
        re-extracted recursively), and regular URLs, queried through the
        site's JSON API (skin=json).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ embeds redirect to a page whose URL fragment carries
            # the real file id; follow it and restart extraction.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The iTunes user agent is what unlocks this JSON API response.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # urlhandle lets the downloader reuse the open connection.
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
554
555
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """RC4-decrypt *data* with *key*, returning a text string."""
        # Key-scheduling algorithm (KSA)
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # Pseudo-random generation (PRGA), XORing the keystream in.
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of *s* as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        """Extract a myvideo.de video.

        Tries the plain <source> tag first; otherwise downloads the
        RC4-encrypted player XML, decrypts it with a key derived from GK
        and the video id, and reads the stream parameters from it.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Base64-wrapped global key used to derive the RC4 session key.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page exposes the video in a plain <source> tag.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Collect flashvars; '_encxml' holds the URL of the encrypted XML.
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # Session key = md5(b64decode(b64decode(GK)) + md5(video_id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            # BUGFIX: video_filepath was referenced here without ever
            # being defined, raising NameError for every f4m (HLS) video.
            # Extract the path from the decrypted data like the other
            # stream parameters.
            video_filepath = self._search_regex(
                'path=\'(.*?)\'', dec_data, u'video path')
            video_playpath = ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
704
705
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates (as reported by the feed), used only for --list-formats.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame dimensions (WxH); informational only
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE whitespace.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each available bitrate with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode (or a single clip) as a list of
        info dicts, one per video part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Shortname form (":tds", ":colbert", ...): rewrite to the show's
        # full-episodes page and re-match so the named groups get populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Single-clip URL; the title group name differs per show.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Full-episode URL; an empty episode part means "newest episode".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The site redirects /full-episodes/ to the newest episode;
            # recover the concrete episode URL from the HTTP handle.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mtvnservices player URIs embedded in the page (one per video part)
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # Episode index (MRSS feed): one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The <guid> looks like "...:<show>.com:<mediaId>".
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part configuration XML listing the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                # Skip this part but keep extracting the remaining ones.
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The feed only exposes RTMP URLs; rewrite them into equivalent
            # HTTP URLs on the llnwd.net CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
872
873
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist (escapistmagazine.com).

    Reads the player/config URLs from the page's <meta> tags, downloads
    the JavaScript player configuration and picks the video URL out of
    its playlist.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # Page title has the form "Show : Episode"; keep the episode part.
        # BUG FIX: this search was mislabelled u'player url', which produced
        # a misleading error message when the title could not be found.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON: swap single quotes for
        # double quotes so json.loads can parse it (fragile, but matches the
        # format the site serves).
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad/intro; entry 1 is the actual video.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
933
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML for the video, then the f4m
    manifest it references, and composes the final fragment URL.
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report manifest download."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Metadata XML: carries title, description, thumbnail and the
        # location of the f4m manifest.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # BUG FIX: the old message claimed the "video info XML" failed,
            # although it is the manifest download that failed here.
            raise ExtractorError(u'Unable to download video manifest: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            # (dropped the unused bound exception variable)
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # Compose the HDS fragment URL from the manifest's host, id and
        # media node url.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
995
996
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is embedded URL-encoded in a flv_url parameter.
        encoded_url = self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(encoded_url)

        # Page <title> carries the video title followed by an " - XVID" suffix.
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
1037
1038
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the slug of the song title are in the URL.
        # (removed the unused 'simple_title' local)
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the track's API record (JSON).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions and pick the 128 kbit/s MP3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
1095
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the slug of the set title are in the URL.
        # (removed the unused 'simple_title' local)
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the set's API record (JSON).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # Report every API error, then give up on the whole set.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            # Per-track stream definitions; pick the 128 kbit/s MP3 stream.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
1158
1159
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The RTMP path is stored base64-encoded in a JS variable.
        match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # The last path component of the media URL is "<id>.<ext>".
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
1202
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): file_url is assigned but never used in this method.
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a GET request; the first one that does
        # not raise is considered active. Returns None if all of them fail.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print every available format and bitrate with its extension."""
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): .decode() assumes Python 2 byte strings; under
        # Python 3 regex groups are str and this raises AttributeError —
        # confirm before re-enabling (_WORKING is False).
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No specific format requested: try formats in dict order until
            # one of them has a URL that is actually reachable.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
1307
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # Dispatch on how specific the URL is:
        #   course + video -> extract that single lecture video
        #   course only    -> recursively extract every video in the course
        #   neither        -> recursively extract every course on the site
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata XML: title and video file name.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Every VideoPage link becomes a 'reference' entry which is then
            # re-extracted through self.extract (hits the first branch).
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Every CoursePage link becomes a 'reference' entry which is then
            # re-extracted through self.extract (hits the course branch).
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1403
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV.com video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Ask the mediaGen service for the renditions available for this uri.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            # BUG FIX: this previously referenced an undefined name
            # 'performer', raising NameError on every successful extraction.
            # No performer/artist field is scraped above, so the uploader is
            # reported as unknown.  TODO: extract the artist from the page
            # if a suitable meta tag exists.
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1464
1465
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple segments; _real_extract returns one
    info dict per segment.  The segment file ids are obfuscated and must be
    descrambled with a seed taken from the playlist JSON.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Build a session id: current time in milliseconds followed by two
        # random numbers, mimicking what the site's player generates.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character alphabet with a linear
        # congruential generator seeded by the server-provided value.  The
        # resulting list maps an index to a file-id character.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # The obfuscated file id is a '*'-separated list of indices into the
        # shuffled alphabet; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format to Youku's stream names: best quality
            # is 'hd2' when available, otherwise 'flv'; 'worst' selects mp4;
            # any other explicit format falls back to 'flv'.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # One info dict per segment; the downloader will fetch each part.
            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
1558
1559
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com video pages."""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group(1)

        # A single page fetch contains every field we need.
        page = self._download_webpage(url, video_id)

        # The FLV URL is percent-encoded inside the player parameters.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, page, u'video URL'))

        title = self._html_search_regex(self.VIDEO_TITLE_RE, page, u'title')

        thumbnail = self._search_regex(self.VIDEO_THUMB_RE, page,
            u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }]
1598
1599
class GooglePlusIE(InfoExtractor):
    """Information extractor for video posts on plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = m.group(0)
        video_id = m.group(1)

        # Step 1: fetch the post page and scrape the metadata from it.
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Normalize the timestamp into YYYYMMDD for use in filenames.
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Only the first line of the description serves as the title.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2: emulate clicking the image box to reach the video page.
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect the (resolution, url) pairs for every available size.
        link_pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        found = re.findall(link_pattern, webpage)
        if len(found) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sorting by resolution puts the best quality last; keep only its URL.
        video_url = sorted(found)[-1][-1]

        # Undo \u0026-style escaping embedded in the URL.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      'flv',
        }]
1673
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group(1)

        page = self._download_webpage(url, video_id)

        # The MP4 lives at a fixed CDN path derived from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # Use only the last path component as the id.
        short_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            page, 'title', default=short_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', page, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
1707
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a channel page, a broadcast (/b/<id>) and a
    # chapter (/c/<id>); each takes a different extraction path below.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page and turn its clips into info dicts.

        Returns a tuple (number of clips in the response, list of info
        dicts for the clips that actually have a video file URL).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # On error the API returns a dict instead of a list of clips.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are skipped but still counted in the
            # returned total (used by the caller to detect the last page).
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel archives: results must be fetched page by page.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter: resolve the chapter to its parent archive, then pull
            # metadata from the twitch kraken API; returns a single video.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else branch runs only if no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single broadcast: one API call, no paging.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        # Fetch pages until a short page signals the end of the archive
        # (single-broadcast requests stop after the first iteration).
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1840
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = m.group('id')
        page = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the <title> element.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
1869
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on the Steam store."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        gameID = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # An age gate may sit in front of the video page; bypass it with a
        # canned birth date via the agecheck URL.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        # Walk the movie entries, their titles and their thumbnails in
        # lockstep; they appear in the same order on the page.
        videos = []
        for movie, name, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
1924
class UstreamIE(InfoExtractor):
    """Information extractor for recorded broadcasts on ustream.tv."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The FLV is served from a predictable CDN location.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            page, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            page, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1956
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            page, u'video URL')

        # Guess the container format from the URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            page, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            page, u'thumbnail', fatal=False)

        if not thumbnail:
            candy = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy is not None:
                video_title = candy.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
1996
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)

        # The show metadata is embedded on the page as a JSON assignment.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            page, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream and derive the extension from its path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
2030
2031
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats whose 'format' matches
        req_format, or None if no such entry exists."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # Narrowed from a bare except: json.loads raises ValueError on
            # malformed input.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # Fixed: the exception object must be stringified before
            # concatenation (str + exception raised TypeError before).
            raise ExtractorError('Missing JSON parameter: ' + compat_str(sys.exc_info()[1]))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The path's 5th component encodes resolution and bitrate,
            # e.g. "480p_370k_8004515" -> format "480p-370k".
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are listed best-first on the page.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: the original tested an undefined name ('result'),
            # which raised NameError whenever a specific format was
            # requested; 'format' is the lookup result.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
2136
2137
2138
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = m.group('videoid')
        # The title comes straight from the URL path.
        video_title = m.group('title')

        page = self._download_webpage(url, video_id)

        # Locate the direct FLV URL and undo percent-encoding.
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            page, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Normalize the displayed upload date when it is present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            page, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
2173
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = m.group('videoid')

        page = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            page, u'title').strip()

        # The player lives on a separate embed page; find its URL and the
        # numeric id it carries.
        embed = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed.group(0).strip()
        video_id = embed.group('videoid')

        page = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            page, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
2214
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (returns one entry per song)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS object literal.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id lets us walk the mix through the "sets" API,
        # one track per request, until at_last_track is reported.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        for idx in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(idx+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
2255
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs are derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
2283
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose-mode regex; must always be matched with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because the default suitable() does not compile
        # _VALID_URL with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on the URL type: a single talk or a whole playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Verbose-mode regex for the per-talk <li> entries on a playlist page.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # Matches the talk link/title paragraph; talk_url is a relative path.
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this IE via a url_result entry;
        # both iterators are assumed to yield entries in the same order.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')  # NOTE(review): extracted but unused
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        # The talkDetails JS object holds the numeric id and the stream list.
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # `info` is rebound here from the parsed JSON to the result dict;
        # htmlStreams[-1] is the last (best) listed stream.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
2358
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is taken from the last (or, with a trailing slash, the
    second-to-last) path element of the URL; all metadata comes from an
    XML endpoint keyed by that id.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: previously this read an undefined name `ext`,
            # raising NameError whenever format_id was absent; fall back
            # to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # description and imagePreview are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2412
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Stream details live in a per-video XML descriptor.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # The last entry in the document describes the best format.
        best = idoc[-1]
        filename = best.findall('./filename')[0].text
        duration = float(best.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2444
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The direct media URL sits in the player configuration.
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a "LiveLeak.com -" prefix that we strip off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
2481
2482
2483
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Normalize to the canonical post URL before fetching.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video tag is embedded inside escaped JS (\x22 == '"').
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        m_video = re.search(re_video, webpage)
        if m_video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = m_video.group('video_url')
        ext = m_video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
2517
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks.

    Only tracks offering a free download can be extracted: the numeric
    track id is read from the page's TralbumData, the free-download page
    is fetched, and a fresh mp3-320 URL is requested from the
    statdownload endpoint (the one embedded in the page expires).
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id` to avoid shadowing the builtin.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp by the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
2563
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The <source> tag carries the direct mp4 URL.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
2591         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Ina publishes a per-video MRSS feed; everything comes from there.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
2618
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical watch page for this id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)
        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
2652
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the canonical page for this id.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)
        self.report_extraction(video_id)

        # All metadata comes from <meta> tags on the page.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
2686
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo secret gates access to the playlist endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # First XML: resolve the internal node id for this video.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        # Second XML: the playlist containing the STREAM element.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        m_stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if m_stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = m_stream.group(1) + unescapeHTML(m_stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
2735
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')

        webpage = self._download_webpage(url, url_title)

        # The numeric video id is embedded in the article markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')
        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document lists the media files; take the "high" one.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
2774
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # Player config: an optional server prefix plus a file path/key.
        m_media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if m_media is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(m_media.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(m_media.group('file'))
        else:
            video_url = m_media.group('server')+'/key='+m_media.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        m_date = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if m_date:
            video_upload_date = m_date.group('upload_date_Y')+m_date.group('upload_date_m')+m_date.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
2826
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The page is requested with ax/ts query args; the Set-Cookie from
        # this response is required later by the /serve/source endpoint.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as JSON in a <script> tag; we use the
        # first track of the list.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # NOTE: track_id is rebound here from the URL slug to the site's
        # internal track id.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # The serve endpoint needs the session cookie captured above and
        # returns JSON containing the final stream URL.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
2876
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via JavaScript; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the info endpoint; the response is a k=v&k=v string with
        # the media URL first and the thumbnail URL second.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
2912
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com videos, reviews and full episodes."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid in a different attribute than videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        # Two feeds: mrss carries the metadata, mediagen carries the stream urls.
        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # re.finditer never yields None, so only an empty match list matters here.
        # Fixed: original raised the non-existent name `ExtractError` (a NameError)
        # with a typo'd message whenever no <src> elements were found.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
2963
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this list ordered from most to least specific: GenericIE is the
    # catch-all and must stay last, and the Youtube* extractors must precede
    # YoutubeIE so playlist/channel/user/search URLs are not treated as videos.
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    ]
    return [klass() for klass in ie_classes]
3033
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name.

    Looks up the class named ``<ie_name>IE`` in this module's namespace;
    raises KeyError if no such extractor exists.
    """
    return globals()['%sIE' % ie_name]