Use _download_webpage in MetacafeIE
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
from __future__ import absolute_import

import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Given a URL, an information extractor produces one or more dictionaries
    describing the video(s) that URL refers to (real media URL, title,
    uploader, ...).  The FileDownloader consumes those dictionaries and
    performs the actual download.

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should re-define the _real_initialize() and _real_extract()
    methods, define a _VALID_URL regexp, and probably be added to the list
    of extractors.  _real_extract() must return a *list* of information
    dictionaries as described above.

    Broken IEs should set the _WORKING attribute to False so users are
    warned and the tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True when this IE can handle the given URL."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default name: the class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        if note is not False:
            # note=False suppresses the status line entirely.
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour an explicit charset in the Content-Type header; default to UTF-8.
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain string URL rather than a Request object.
                url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')

    # Helpers for following #608: they set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url', 'url': url}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        result = {'_type': 'playlist', 'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
168
169
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose pattern; group 1 is the (optional) URL prefix, group 2 the
    # video ID (see _extract_id, which reads mobj.group(2)).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Page fetched to pin the interface language to English.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original URL from age-verification style redirect links.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing falls back to 'flv'
    # (see _real_extract).
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string, used for --list-formats output and the
    # 'format' field.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
229
230     @classmethod
231     def suitable(cls, url):
232         """Receives a URL and returns True if suitable for this IE."""
233         if YoutubePlaylistIE.suitable(url): return False
234         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
235
    # Console status helpers: every message goes through the attached
    # downloader's to_screen, prefixed with the IE name.
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report that the list of available subtitles is being fetched."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
280
281     def _get_available_subtitles(self, video_id):
282         self.report_video_subtitles_download(video_id)
283         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
284         try:
285             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
286         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
287             return (u'unable to download video subtitles: %s' % compat_str(err), None)
288         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
289         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
290         if not sub_lang_list:
291             return (u'video doesn\'t have subtitles', None)
292         return sub_lang_list
293
294     def _list_available_subtitles(self, video_id):
295         sub_lang_list = self._get_available_subtitles(video_id)
296         self.report_video_subtitles_available(video_id, sub_lang_list)
297
298     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
299         """
300         Return tuple:
301         (error_message, sub_lang, sub)
302         """
303         self.report_video_subtitles_request(video_id, sub_lang, format)
304         params = compat_urllib_parse.urlencode({
305             'lang': sub_lang,
306             'name': sub_name,
307             'v': video_id,
308             'fmt': format,
309         })
310         url = 'http://www.youtube.com/api/timedtext?' + params
311         try:
312             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
313         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
314             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
315         if not sub:
316             return (u'Did not fetch video subtitles', None, None)
317         return (None, sub_lang, sub)
318
319     def _extract_subtitle(self, video_id):
320         """
321         Return a list with a tuple:
322         [(error_message, sub_lang, sub)]
323         """
324         sub_lang_list = self._get_available_subtitles(video_id)
325         sub_format = self._downloader.params.get('subtitlesformat')
326         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
327             return [(sub_lang_list[0], None, None)]
328         if self._downloader.params.get('subtitleslang', False):
329             sub_lang = self._downloader.params.get('subtitleslang')
330         elif 'en' in sub_lang_list:
331             sub_lang = 'en'
332         else:
333             sub_lang = list(sub_lang_list.keys())[0]
334         if not sub_lang in sub_lang_list:
335             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
336
337         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
338         return [subtitle]
339
340     def _extract_all_subtitles(self, video_id):
341         sub_lang_list = self._get_available_subtitles(video_id)
342         sub_format = self._downloader.params.get('subtitlesformat')
343         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
344             return [(sub_lang_list[0], None, None)]
345         subtitles = []
346         for sub_lang in sub_lang_list:
347             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
348             subtitles.append(subtitle)
349         return subtitles
350
351     def _print_formats(self, formats):
352         print('Available formats:')
353         for x in formats:
354             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
355
    def _real_initialize(self):
        """Set the YouTube interface language and, when credentials are
        available (options or .netrc), log in and confirm age.

        All failures are reported through the downloader and abort
        initialization early; nothing is raised to the caller.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX / dsh form tokens that the login form
        # expects to be posted back.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return
459
460     def _extract_id(self, url):
461         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
462         if mobj is None:
463             self._downloader.report_error(u'invalid URL: %s' % url)
464             return
465         video_id = mobj.group(2)
466         return video_id
467
468     def _real_extract(self, url):
469         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
470         mobj = re.search(self._NEXT_URL_RE, url)
471         if mobj:
472             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
473         video_id = self._extract_id(url)
474
475         # Get video webpage
476         self.report_video_webpage_download(video_id)
477         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
478         request = compat_urllib_request.Request(url)
479         try:
480             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
481         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
482             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
483             return
484
485         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
486
487         # Attempt to extract SWF player URL
488         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
489         if mobj is not None:
490             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
491         else:
492             player_url = None
493
494         # Get video info
495         self.report_video_info_webpage_download(video_id)
496         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
497             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
498                     % (video_id, el_type))
499             video_info_webpage = self._download_webpage(video_info_url, video_id,
500                                     note=False,
501                                     errnote='unable to download video info webpage')
502             video_info = compat_parse_qs(video_info_webpage)
503             if 'token' in video_info:
504                 break
505         if 'token' not in video_info:
506             if 'reason' in video_info:
507                 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
508             else:
509                 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
510             return
511
512         # Check for "rental" videos
513         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
514             self._downloader.report_error(u'"rental" videos not supported')
515             return
516
517         # Start extracting information
518         self.report_information_extraction(video_id)
519
520         # uploader
521         if 'author' not in video_info:
522             self._downloader.report_error(u'unable to extract uploader name')
523             return
524         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
525
526         # uploader_id
527         video_uploader_id = None
528         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
529         if mobj is not None:
530             video_uploader_id = mobj.group(1)
531         else:
532             self._downloader.report_warning(u'unable to extract uploader nickname')
533
534         # title
535         if 'title' not in video_info:
536             self._downloader.report_error(u'unable to extract video title')
537             return
538         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
539
540         # thumbnail image
541         if 'thumbnail_url' not in video_info:
542             self._downloader.report_warning(u'unable to extract video thumbnail')
543             video_thumbnail = ''
544         else:   # don't panic if we can't find it
545             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
546
547         # upload date
548         upload_date = None
549         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
550         if mobj is not None:
551             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
552             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
553             for expression in format_expressions:
554                 try:
555                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
556                 except:
557                     pass
558
559         # description
560         video_description = get_element_by_id("eow-description", video_webpage)
561         if video_description:
562             video_description = clean_html(video_description)
563         else:
564             video_description = ''
565
566         # subtitles
567         video_subtitles = None
568
569         if self._downloader.params.get('writesubtitles', False):
570             video_subtitles = self._extract_subtitle(video_id)
571             if video_subtitles:
572                 (sub_error, sub_lang, sub) = video_subtitles[0]
573                 if sub_error:
574                     self._downloader.report_error(sub_error)
575
576         if self._downloader.params.get('allsubtitles', False):
577             video_subtitles = self._extract_all_subtitles(video_id)
578             for video_subtitle in video_subtitles:
579                 (sub_error, sub_lang, sub) = video_subtitle
580                 if sub_error:
581                     self._downloader.report_error(sub_error)
582
583         if self._downloader.params.get('listsubtitles', False):
584             sub_lang_list = self._list_available_subtitles(video_id)
585             return
586
587         if 'length_seconds' not in video_info:
588             self._downloader.report_warning(u'unable to extract video duration')
589             video_duration = ''
590         else:
591             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
592
593         # token
594         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
595
596         # Decide which formats to download
597         req_format = self._downloader.params.get('format', None)
598
599         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
600             self.report_rtmp_download()
601             video_url_list = [(None, video_info['conn'][0])]
602         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
603             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
604             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
605             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
606             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
607
608             format_limit = self._downloader.params.get('format_limit', None)
609             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
610             if format_limit is not None and format_limit in available_formats:
611                 format_list = available_formats[available_formats.index(format_limit):]
612             else:
613                 format_list = available_formats
614             existing_formats = [x for x in format_list if x in url_map]
615             if len(existing_formats) == 0:
616                 self._downloader.report_error(u'no known formats available for video')
617                 return
618             if self._downloader.params.get('listformats', None):
619                 self._print_formats(existing_formats)
620                 return
621             if req_format is None or req_format == 'best':
622                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
623             elif req_format == 'worst':
624                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
625             elif req_format in ('-1', 'all'):
626                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
627             else:
628                 # Specific formats. We pick the first in a slash-delimeted sequence.
629                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
630                 req_formats = req_format.split('/')
631                 video_url_list = None
632                 for rf in req_formats:
633                     if rf in url_map:
634                         video_url_list = [(rf, url_map[rf])]
635                         break
636                 if video_url_list is None:
637                     self._downloader.report_error(u'requested format not available')
638                     return
639         else:
640             self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
641             return
642
643         results = []
644         for format_param, video_real_url in video_url_list:
645             # Extension
646             video_extension = self._video_extensions.get(format_param, 'flv')
647
648             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
649                                               self._video_dimensions.get(format_param, '???'))
650
651             results.append({
652                 'id':       video_id,
653                 'url':      video_real_url,
654                 'uploader': video_uploader,
655                 'uploader_id': video_uploader_id,
656                 'upload_date':  upload_date,
657                 'title':    video_title,
658                 'ext':      video_extension,
659                 'format':   video_format,
660                 'thumbnail':    video_thumbnail,
661                 'description':  video_description,
662                 'player_url':   player_url,
663                 'subtitles':    video_subtitles,
664                 'duration':     video_duration
665             })
666         return results
667
668
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST the age
        confirmation so later watch-page requests are unfiltered."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video info dict from a metacafe watch URL.

        YouTube-hosted videos ('yt-' id prefix) are delegated to the
        YouTube extractor via url_result().
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: direct mediaURL (plus optional gdaKey token)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: the URL lives inside the flashvars query string
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # JSON-escaped slashes need unescaping
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # _download_webpage returns a unicode string, so no .decode('utf-8')
        # calls here (they would fail on Python 3 and were leftovers from
        # the raw urlopen().read() days).
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
788
789
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the video info dict from a Dailymotion watch URL."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the URL path component before any title slug or query
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the watch page with the family filter disabled so that
        # age-restricted videos are served too
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best available quality, highest first
        quality_keys = ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url')
        max_quality = next((k for k in quality_keys if k in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: prefer the regular owner markup, fall back to the
        # official-user markup, otherwise warn and leave it unset
        video_uploader = None
        owner = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner is not None:
            video_uploader = owner.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; reassemble as YYYYMMDD
        video_upload_date = None
        date_m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_m is not None:
            video_upload_date = date_m.group(3) + date_m.group(2) + date_m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
877
878
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the video info dict from a photobucket page URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # _download_webpage (as used by the other extractors in this file)
        # returns a unicode string, so the former manual urlopen/try-except
        # and the .decode('utf-8') calls are no longer needed.
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
942
943
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video info dict from a Yahoo! Video URL.

        Non-/watch/ URLs are rewritten to the canonical /watch/ form and
        this method recurses once (new_video=False).

        NOTE(review): this extractor is marked _WORKING = False; it still
        reads raw bytes via urlopen and applies str-pattern regexes, which
        only works on Python 2 — needs a _download_webpage port.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group(1) is the literal 'people'/'profile' path segment;
        # the uploader name is the second capture group.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1085
1086
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video info dict from a vimeo.com URL.

        Reads the page's embedded config JSON for title, owner, thumbnail
        and the request signature/timestamp needed to build the
        play_redirect download URL.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page source.
        # A bare `except:` here used to swallow SystemExit/KeyboardInterrupt;
        # only the errors split()/json.loads() actually raise are caught now.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1205
1206
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body; report errors through the
        downloader and return None (implicitly) on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* (with *regexFlags*), and return a dict
        built from *matchTuples*: each (group_index, key, error_message)
        triple maps match group ``group_index`` to ``info[key]``.

        Reports an error and returns None when the regex does not match or
        a required group is empty.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None and re.search
        # will raise TypeError — presumably acceptable here; confirm.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): uses the legacy trouble() while the rest of
                # the file uses report_error — confirm they are equivalent.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream page to its rtmp URL.

        NOTE(review): this computes video_url but never returns it, so the
        live path produces no info dict — looks unfinished; confirm before
        relying on it.
        """
        # Language code is the 4th-from-last path component of the URL
        video_lang = url.split('/')[-4]
        # First hop: locate the videothek JavaScript file
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Second hop: pull path, SWF player and rtmp URL out of the JS
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two intermediate XML documents and
        return the info dict for the 'hd' quality stream."""
        # Language code is the 3rd-from-last path component of the URL
        video_lang = url.split('/')[-3]
        # Hop 1: the player param carries the videoref file URL
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Hop 2: pick the <video> ref matching the page language
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Hop 3: final XML with id, title, date and the hd-quality URL
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extraction path based on
        whether the last URL component matches _LIVE_URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns nothing, so this path
            # yields no results (see note on that method).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1341
1342
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First follows URL-shortener style redirects, then probes an arbitrary
    webpage for common video-embedding patterns (JW Player in SWFObject,
    ``file=``/``source=`` flashvars parameters).
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download (with a warning: this IE is the fallback)."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following redirects: nothing to do.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fixed copy-paste error: this branch fails to extract the
            # uploader (domain name), not the title.
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1486
1487
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch', 'ytsearchN' and 'ytsearchall' pseudo-URLs by
    querying the gdata API and queueing each result for download.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split 'ytsearchN:terms' into the count prefix and the query proper.
        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    # Clamp oversized requests to the API maximum.
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; 'limit' shrinks to the real
        # total once the first response reports it.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                # Consistency fix: use report_error like every other
                # extractor instead of the legacy trouble() spelling.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1566
1567
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles 'gvsearch', 'gvsearchN' and 'gvsearchall' pseudo-URLs by
    scraping the result pages and queueing every found video.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and fetch the requested result count."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            # Bare 'gvsearch:' means a single result.
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_google_results:
                # Clamp oversized requests to the supported maximum.
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                count = self._max_google_results
            self._download_n_results(query, count)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Collect up to *n* unique video ids across result pages and queue them."""

        collected = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Pull new (unseen) video identifiers off this result page.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = mobj.group(1)
                if candidate in collected:
                    continue
                collected.append(candidate)
                if len(collected) == n:
                    # Specified n videos reached.
                    self._queue_videos(collected)
                    return

            # No "next page" link: the results are exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._queue_videos(collected)
                return

            pagenum = pagenum + 1

    def _queue_videos(self, video_ids):
        """Hand every collected id to the downloader as a playback URL."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
1648
1649
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles 'yvsearch', 'yvsearchN' and 'yvsearchall' pseudo-URLs.
    """

    # NOTE(review): presumably _WORKING = False marks this extractor as
    # broken so it is skipped by default - confirm against the base class.
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    # Result page template; placeholders are (quoted query, page number).
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Captures the 'owner/id' portion of a watch link on a result page.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split 'yvsearchN:terms' into the count prefix and the query proper.
        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'yvsearch:' means a single result.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    # Clamp oversized requests to the supported maximum.
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # NOTE(review): .read() yields bytes on Python 3 while the
                # regexes below are str patterns - looks Python 2 only; confirm.
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # Ran out of result pages before reaching n: queue what we have.
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1734
1735
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so match with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch every playlist entry from the gdata API and return a playlist result."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Whichever alternative of _VALID_URL matched captured the id.
        playlist_id = mobj.group(1) or mobj.group(2)

        positioned_urls = []
        page_num = 1

        while True:
            self.report_download_page(playlist_id, page_num)

            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            try:
                page = compat_urllib_request.urlopen(api_url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is an exact multiple of _MAX_RESULTS:
                # the previous page was already the last one.
                break

            entries = response['feed']['entry']
            positioned_urls.extend((entry['yt$position']['$t'], entry['content']['src'])
                                   for entry in entries
                                   if 'content' in entry)

            if len(entries) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the video URLs.
        ordered_urls = [pair[1] for pair in sorted(positioned_urls)]

        url_results = [self.url_result(video_url) for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id)]
1816
1817
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the unique video ids referenced by watch links, in page order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        """Collect every video of a channel and return it as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        pagenum = 1

        # The first channel page is served as plain HTML.
        self.report_download_page(channel_id, pagenum)
        request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the JSON-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                self.report_download_page(channel_id, pagenum)
                request = compat_urllib_request.Request(self._MORE_PAGES_URL % (pagenum, channel_id))
                try:
                    raw = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                payload = json.loads(raw)

                video_ids.extend(self.extract_videos_from_page(payload['content_html']))

                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id)
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1890
1891
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all uploads of a user via the gdata API and return a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The gdata API caps each query at _GDATA_PAGE_SIZE results, so we
        # page through until a query returns fewer ids than a full page.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the deduplicated ids found on this page.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in ids_in_page:
                    ids_in_page.append(match.group(1))
            video_ids.extend(ids_in_page)

            # A page that is not full must be the last one - no need to
            # query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id)
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1962
1963
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect every video of a blip.tv user and return it as a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            if mobj is None:
                # Robustness fix: previously a missing id attribute caused
                # an uncaught AttributeError on mobj.group(1).
                self._downloader.report_error(u'unable to extract user id from %s' % url)
                return
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Consistency fix: use compat_str like the other extractors.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Bug fix: unescape before the dedup test, so the check
                # compares the same form that is stored in the list.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        # Bug fix: the original referenced an undefined name
        # 'all_ids_count' here, raising NameError on every successful run.
        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, len(video_ids), len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2045
2046
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (the gateway_result POST field simulates clicking the button).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction notice (e.g. download
                # limits), collapsing whitespace to a one-line message.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls below require byte
        # strings, i.e. Python 2 semantics; on Python 3 they would fail.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2105
2106
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied (options or .netrc)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: anonymous access only
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response so the re.search below runs on text, not
            # bytes (urlopen().read() returns bytes on Python 3)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # A login form in the response means the credentials were rejected
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are set up by inline JS; grab the JSON array
        # that sits between these two literal markers
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2204
2205
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at the media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; resolve it and restart extraction with the canonical URL
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves the JSON skin when it sees the iTunes user agent
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # os.path.splitext returns str; the old .decode('UTF-8') call
                # here crashed on Python 3 and has been removed
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2306
2307
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this used to call self._download.report_error, which
            # raised AttributeError instead of reporting the invalid URL
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the movie's base URL; the flv sits
        # alongside the thumbs directory
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2356
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates; the last entry of turls (highest bitrate) is the default pick
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension (used for --list-formats output)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> frame dimensions (used for --list-formats output)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report configuration download for one media part."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report show index download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available formats as bitrate / extension / dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode or clip; returns a list of info dicts."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcut names (":tds", ":colbert", ...) map to the show's
        # full-episodes page, then the URL is re-parsed below
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode given: the site redirects to the newest one
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn which episode we landed on
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id (data-mgid) without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The mrss index lists each part of the episode as a separate <item>
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid is colon-separated; last component is the media id,
            # second-to-last is "<show>.com"
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            # Each <rendition> element offers one (bitrate, rtmp url) pair
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL to its equivalent path on the
            # llnwd.net HTTP host
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2551
2552
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset declared in the Content-Type header, if any
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Each of these meta tags may be absent; report a clean error
        # instead of crashing with AttributeError on .group() of None
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.report_error(u'unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.report_error(u'unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The configuration URL is passed as the "config" parameter of the
        # player URL, percent-encoded
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes must be
        # turned into double quotes before json.loads() will accept it
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2626
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Currently known to be broken
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report XML manifest download."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video info: metadata XML first, then the f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally below
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            # findall(...)[0] raises IndexError when a required node is missing
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # The manifest elements live in the Adobe F4M XML namespace
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the fragment URL from the manifest host plus media/id data
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2697
2698
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Media URL is URL-encoded inside the page's flashvars
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the page <title>, minus the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: keep the entire matched URL (group 0)
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2756
2757
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into track metadata (including the numeric
        # track id) via SoundCloud's public resolve API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title,
                                           note=u'Downloading info JSON',
                                           errnote=u'unable to download video webpage')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Ask the CDN for the stream definitions of this track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             note=u'Downloading stream definitions',
                                             errnote=u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2830
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error is the current API; the old trouble() call is deprecated.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into set metadata (including the track list)
        # via SoundCloud's public resolve API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title,
                                           note=u'Downloading info JSON',
                                           errnote=u'unable to download video webpage')

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports problems (e.g. a private or missing set) in an
            # 'errors' array; surface every message and abort.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction(full_title)

            # Ask the CDN for the stream definitions of this track.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, full_title,
                                                 note=u'Downloading stream definitions',
                                                 errnote=u'unable to download stream definitions')

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2911
2912
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded rtmpe path
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive the video id and extension from the final path component.
        # Use rsplit with maxsplit=1 so a filename containing extra dots
        # does not raise ValueError during the two-way unpack.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2966
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested format/bitrate; falls back
        to the format entry itself when there is no per-bitrate mapping.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url; re groups are already text,
        # so no .decode() is needed (str.decode crashes on Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (urlopen returns bytes; decode before loading)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try every format until one of its urls responds
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3081
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a course page, or a single video page; the
    # optional named groups 'course' and 'video' drive the dispatch below.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, a course page (playlist of
        videos), or the site root (playlist of courses). Playlist branches
        recurse through self.extract on each referenced page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title from the first <h1>; fall back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect each video page link once, preserving page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse into each video page (hits the first branch above).
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect each course page link once, preserving page order.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse into each course page (hits the branch above).
                results += self.extract(entry['url'])
            return results
3193
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns decoded text, so no extra .decode()
        # is required (calling str.decode crashes on Python 3).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen service returns an XML document describing the
        # available renditions of the video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error is the current API; trouble() is deprecated.
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3273
3274
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the current time in ms
        plus two random components."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the source alphabet using the
        server-provided seed; used to decode the obfuscated file id.
        The recurrence must match Youku's player exactly."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential style step; each iteration picks and
            # removes one character from the remaining alphabet.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated file id: each component is
        an index into the seed-shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # The getPlayList endpoint returns JSON with title, seed, stream
        # file ids and per-segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's requested format onto Youku's format names.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One key per video segment; each segment is fetched separately.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3384
3385
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Patterns for the flv url, page title and thumbnail embedded in the page
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # Get webpage content; _download_webpage handles reporting,
        # fetching and utf-8 decoding in one place.
        webpage = self._download_webpage(url, video_id)

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3448
3449
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report extracted entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report extracted uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report extracted title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        # (use the shared _download_webpage helper instead of a raw urlopen)
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id,
            note=u'Downloading entry webpage',
            errnote=u'Unable to retrieve entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # Fixed: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id,
            note=u'Downloading video page',
            errnote=u'Unable to retrieve video webpage')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # Fixed: previously fell through and crashed on links[-1]
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3573
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first group of rexp in the page (HTML-unescaped),
            # or default when the pattern does not match
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed: the key was misspelled 'uploader_date'; the supported
            # optional field (see InfoExtractor docstring) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3609
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # Fixed: returning None made the caller's tuple unpacking raise a
            # TypeError on top of the real error; return an empty page instead.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # Fixed: same None-unpacking crash as above
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # API dates look like YYYY-MM-DD...; keep the digits only
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # A channel URL (only one group matched) is paginated; a single
        # broadcast URL is fetched in one request
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3696
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Fixed: previously fell through and crashed on m.group('url')
            return
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player headline; fall back to the page <title>
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # Fixed: replaced the deprecated trouble() call and bail out
                # instead of crashing on m.group('title') below
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3735
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode comments, so re.VERBOSE must be
        # passed explicitly (the default suitable() would not)
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Each trailer appears as a 'movie_<id>' JS object with a FILENAME
        # and an optional MOVIE_NAME
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always scrape the game's /video/ index page, whatever URL was given
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # NOTE(review): zip() pairs the three match streams by position; this
        # assumes movies, titles and thumbnails appear in the same order and
        # count on the page — verify against current markup
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
                  }
            videos.append(info)
        return videos
3776
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The recorded-video id is part of the URL itself
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)
        # Title and uploader are scraped from data attributes on the page
        title = re.search(r'data-title="(?P<title>.+)"',webpage).group('title')
        uploader = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage).group('uploader')
        return [{
            'id': video_id,
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3798
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        # Use the shared _download_webpage helper instead of a raw urlopen
        # (consistent error reporting and decoding)
        webpage_src = self._download_webpage(url, video_id)

        mobj = re.search(_src_url, webpage_src)

        if mobj is not None:
            video_url = mobj.group()
            # Derive the container from the matched media URL
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # Fixed: replaced the deprecated trouble() call; report_error
            # already prefixes 'ERROR: ' so the literal prefix was dropped
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3854
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON blob assigned to gon.show
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream and infer the extension from the path
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3889
3890
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>"
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: this used to test the stale 'result' variable left over
            # from the regex searches above, so an unavailable requested
            # format silently returned [None] instead of reporting an error.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
4007
4008
4009
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed: the message used to say 'video title' although this
            # search extracts the upload date (copy-paste error)
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4051
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page; it links to the embed page that carries the
        # actual media URL
        webpage = self._download_webpage(url, video_id)

        # The title lives in the page's <title> tag
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Locate the embed page URL (it also carries the numeric video id)
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the media URL through its "file" variable
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4097
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of songs)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded as a JSON blob assigned to PAGE.mix
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # A random session token is sufficient to drive the play API
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # The API hands out one track per request; follow /next until the
        # response flags the last track
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
4141
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)
        # Title comes from the OpenGraph metadata; uploader from the
        # user profile box on the page
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': clean_html(uploader_match.group('uploader'))
        }]
4165
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode comments, so re.VERBOSE must be
        # passed explicitly (the default suitable() would not)
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on the URL shape: a single talk or a whole playlist
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        # NOTE(review): zip() pairs talk entries with title links by position;
        # this assumes both patterns match in the same order and count
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and the media slug
        # that the download URL is built from
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4238
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de, driven by the site's XML metadata endpoint."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch previously read `format = ext`, a NameError
            # (no name `ext` exists here); fall back to the filename extension.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4294
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de video pages."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The page only yields the title; stream data comes from a separate XML.
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        # The last child element of the document describes the stream to use.
        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4327
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # use report_error consistently instead of the deprecated trouble()
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: previously fell through after reporting and crashed with
            # an AttributeError on m.group() below; bail out instead.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # description and uploader are optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4376
class ARDIE(InfoExtractor):
    """Extractor for the ARD Mediathek / mediathek.daserste.de library."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # prefer the numeric documentId query parameter when the URL has one,
        # otherwise fall back to the last path component
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [match.groupdict() for match in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        best = max([s for s in streams if int(s["media_type"]) == 0],
                   key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if best['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert best['video_url'].startswith('mp4:')
            info["url"] = best["rtmp_url"]
            info["play_path"] = best['video_url']
        else:
            assert best["video_url"].endswith('.mp4')
            info["url"] = best["video_url"]
        return [info]
4416
4417
def gen_extractors():
    """ Return a list with an instance of every supported extractor.
    Order matters: the first extractor whose URL pattern matches handles the URL.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]