cf31970ef5a5b778de64a9a021483dcc7df192f8
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; instances shadow _ready/_downloader in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE" suffix.
        # Subclasses may shadow this property with a plain attribute.
        return type(self).__name__[:-2]

    @staticmethod
    def _guess_encoding_from_content_type(content_type):
        """Return the charset advertised by a Content-Type header value.

        Falls back to 'utf-8' when no charset parameter is present or the
        header does not look like a media type.  The charset value may be
        quoted (e.g. 'text/html; charset="utf-8"') and may be followed by
        further parameters; quotes and trailing parameters are stripped so
        the result is always usable with bytes.decode().
        """
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset="?([^"\s;]+)"?', content_type)
        if m:
            return m.group(1)
        return 'utf-8'

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle.

        note=False suppresses the progress message entirely; note=None uses
        a default message.  Raises ExtractorError (with the original
        traceback attached) on any network-level failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string.

        The response body is decoded with the charset announced in the
        Content-Type header (UTF-8 if absent), with undecodable bytes
        replaced rather than raising.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Quoted charsets (charset="utf-8") used to leak the quotes into
        # bytes.decode() and crash with LookupError; the helper strips them.
        encoding = self._guess_encoding_from_content_type(content_type)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is a plain string, not a Request object
                url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')
146
147
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: matches full watch/embed/short URLs as well as a naked
    # 11-character-style video ID.  Group 2 is the video ID (see _extract_id).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; itags missing here default to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> resolution string shown by _print_formats / used in 'format'
    # (listed as height x width here — kept as-is for display purposes).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the base-class IE_NAME property with a plain attribute.
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; defer them explicitly.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name, or a
        (error_message, None) tuple when the list cannot be fetched or is
        empty.  Callers distinguish the two cases with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Scrape (name, lang_code) pairs out of the XML track list.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download a single subtitle track.

        Return tuple:
        (error_message, sub_lang, sub)
        where error_message is None on success.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Download the single preferred subtitle track.

        Language preference: --sub-lang option, then 'en', then the first
        available language.

        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples, one per language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one 'itag : ext [resolution]' line per available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, when credentials are available,
        log in and confirm age.  All failures degrade to warnings except
        the final age-confirmation step."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so later regex scraping works)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX / dsh form tokens out of the login page.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID (group 2 of _VALID_URL) or report an error
        and return None for unmatched URLs."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract one or more info dicts (one per selected format) for the
        video addressed by url.  Returns None after reporting an error."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the el= variants in turn until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalise separators, then try each known date layout
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # Non-matching layouts are skipped.  NOTE(review): after a
                    # successful parse, the remaining expressions fail here
                    # silently, leaving the converted YYYYMMDD value intact.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        # duration (in seconds, as a string)
        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a urlencoded query string
            # describing one stream (itag, url, sig, ...).
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every remaining stream dict has a 'sig'
            # key; an entry without one would raise KeyError here — verify.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected (format, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
645
646
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST an age
        confirmation so later video pages are served unfiltered.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age: the POST is what sets the filter-off state (server side
        # / via cookies handled by the opener); the response body is unused.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract id, direct media URL, title and uploader from a metacafe
        watch page.  Videos with an id prefixed 'yt-' are hosted on YouTube
        and are delegated to the downloader (and thus the YouTube extractor).
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Message fixed: previously read 'unable retrieve video webpage',
            # inconsistent with the other error messages in this file.
            self._downloader.report_error(u'unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage.  Two page layouts are
        # handled: a plain &mediaURL= parameter, or a flashvars blob carrying
        # a JSON-ish mediaData entry.
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available; it must be appended as a __gda__
            # query parameter for the media server to accept the request.
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # The URL comes JSON-escaped ('\/' for '/').
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): .decode('utf-8') assumes Python 2 byte strings here;
        # under Python 3 the regex groups would be str/bytes mismatched.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
772
773
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL, title, uploader and upload
        date from a Dailymotion video page.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the path segment before any '_<slug>' suffix or query.
        video_id = m.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the first (highest) quality key present in the flashvars.
        quality_keys = ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url')
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # The URL is percent-encoded and JSON-escaped ('\/' for '/').
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: regular owner markup first, then the official-user form;
        # a missing uploader is only a warning, not a failure.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; normalise to YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = '%s%s%s' % (m.group(3), m.group(2), m.group(1))

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
861
862
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Announce that the video webpage is being downloaded."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the direct flv URL, title and uploader from a photobucket
        media page.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # The id is the flv filename captured from the 'current=' parameter.
        video_id = m.group(1)

        video_extension = 'flv'

        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        m = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title and uploader come from the same <title> pattern.
        m = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = m.group(1).decode('utf-8')
        video_uploader = m.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
926
927
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video page.

        Non-'/watch/' URLs are first resolved to their canonical /watch/
        form and re-extracted once (signalled by new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # Bug fix: group(1) is the '(people|profile)' path component of the
        # regex; the uploader's display name is the anchor text in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; both are required by the playlist
        # endpoint below, otherwise the server sends a 401.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1069
1070
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a vimeo.com page by parsing the embedded
        player config JSON and building a play_redirect URL.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # HLS redirect URLs don't carry the config; use the canonical page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the markers aren't in the page; ValueError: the
            # slice isn't valid JSON.  (Was a bare except, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag.
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1189
1190
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None after reporting
        an error.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urlopen raises ValueError for malformed URLs.
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* with *regexFlags*, and collect the
        groups described by *matchTuples* — a list of (group index, dict key,
        error message) — into a dict.  Returns None after reporting if the
        page does not match or a listed group is empty.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None and re.search
        # will raise TypeError instead of reporting cleanly — confirm intended.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # err already carries an 'ERROR: ' prefix, as trouble() expects
                # a complete message (unlike report_error, used elsewhere).
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the videothek javascript indirection of a live page to
        locate its rtmp stream.

        NOTE(review): video_url is computed but never returned or stored, so
        callers currently get None for live streams — live support looks
        unfinished.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page to its video metadata by following the
        videoref XML chain, and return an info dict (hd quality URL).
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between live pages (matched by _LIVE_URL) and arte+7
        pages.  Live pages currently yield no result (see extractLiveStream).
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1325
1326
1327 class GenericIE(InfoExtractor):
1328     """Generic last-resort information extractor."""
1329
1330     _VALID_URL = r'.*'
1331     IE_NAME = u'generic'
1332
1333     def __init__(self, downloader=None):
1334         InfoExtractor.__init__(self, downloader)
1335
1336     def report_download_webpage(self, video_id):
1337         """Report webpage download."""
1338         if not self._downloader.params.get('test', False):
1339             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1340         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1341
1342     def report_extraction(self, video_id):
1343         """Report information extraction."""
1344         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1345
1346     def report_following_redirect(self, new_url):
1347         """Report information extraction."""
1348         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1349
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # HEAD avoids downloading the response body just to learn the
            # final URL.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: the HEAD re-request has
                    # no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Retry with a plain (GET) Request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # A bare OpenerDirector (rather than build_opener) is used so that
        # only the handlers listed here — including our HEAD-preserving
        # redirect handler — are installed.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect: let the normal extraction continue.
        if url == new_url:
            return False

        # Hand the resolved URL back to the downloader to restart the
        # extractor chain from the top.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1404
1405     def _real_extract(self, url):
1406         if self._test_redirect(url): return
1407
1408         video_id = url.split('/')[-1]
1409         try:
1410             webpage = self._download_webpage(url, video_id)
1411         except ValueError as err:
1412             # since this is the last-resort InfoExtractor, if
1413             # this error is thrown, it'll be thrown here
1414             self._downloader.report_error(u'Invalid URL: %s' % url)
1415             return
1416
1417         self.report_extraction(video_id)
1418         # Start with something easy: JW Player in SWFObject
1419         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1420         if mobj is None:
1421             # Broaden the search a little bit
1422             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1423         if mobj is None:
1424             # Broaden the search a little bit: JWPlayer JS loader
1425             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1426         if mobj is None:
1427             self._downloader.report_error(u'Invalid URL: %s' % url)
1428             return
1429
1430         # It's possible that one of the regexes
1431         # matched, but returned an empty group:
1432         if mobj.group(1) is None:
1433             self._downloader.report_error(u'Invalid URL: %s' % url)
1434             return
1435
1436         video_url = compat_urllib_parse.unquote(mobj.group(1))
1437         video_id = os.path.basename(video_url)
1438
1439         # here's a fun little line of code for you:
1440         video_extension = os.path.splitext(video_id)[1][1:]
1441         video_id = os.path.splitext(video_id)[0]
1442
1443         # it's tempting to parse this further, but you would
1444         # have to take into account all the variations like
1445         #   Video Title - Site Name
1446         #   Site Name | Video Title
1447         #   Video Title - Tagline | Site Name
1448         # and so on and so forth; it's just not practical
1449         mobj = re.search(r'<title>(.*)</title>', webpage)
1450         if mobj is None:
1451             self._downloader.report_error(u'unable to extract title')
1452             return
1453         video_title = mobj.group(1)
1454
1455         # video uploader is domain name
1456         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1457         if mobj is None:
1458             self._downloader.report_error(u'unable to extract title')
1459             return
1460         video_uploader = mobj.group(1)
1461
1462         return [{
1463             'id':       video_id,
1464             'url':      video_url,
1465             'uploader': video_uploader,
1466             'upload_date':  None,
1467             'title':    video_title,
1468             'ext':      video_extension,
1469         }]
1470
1471
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearchN:/ytsearchall: prefix and run the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                # report_error for consistency with the rest of this class;
                # the deprecated trouble() call was a leftover.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer total results than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1550
1551
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN:/gvsearchall: prefix and run the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode to text: the indicator regexes are str patterns, and
                # matching them against bytes raises TypeError on Python 3.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1632
1633
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN:/yvsearchall: prefix and run the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode to text: the indicator regexes are str patterns, and
                # matching them against bytes raises TypeError on Python 3.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1718
1719
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the flag must accompany
        # every match attempt.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the playlist for download, in order."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # The playlist id may be captured by either alternative of the pattern.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Walk the paginated GData feed, collecting (position, url) pairs.
        videos = []
        page_num = 1
        while True:
            self.report_download_page(playlist_id, page_num)

            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response or 'entry' not in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            entries = response['feed']['entry']
            for entry in entries:
                if 'content' in entry:
                    videos.append((entry['yt$position']['$t'], entry['content']['src']))

            # A short page means the feed is exhausted.
            if len(entries) < self._MAX_RESULTS:
                break
            page_num += 1

        # Order by playlist position, then keep only the urls.
        videos = [link for (_, link) in sorted(videos)]
        total = len(videos)

        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        videos = videos[start:] if end == -1 else videos[start:end]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1810
1811
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video found on the channel's paginated listing."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)

        # Page through the listing until the "next page" marker disappears.
        video_ids = []
        pagenum = 1
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect each watch-link id once per page, preserving order.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1862
1863
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue the user's uploads, honouring playliststart/playlistend."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 entries), so
        # keep requesting successive windows until one comes back short.
        video_ids = []
        pagenum = 0
        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate within the page while keeping first-seen order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            video_ids.extend(page_ids)

            # A partially filled window means there are no further pages.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            pagenum += 1

        all_ids_count = len(video_ids)
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[first:] if last == -1 else video_ids[first:last]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1945
1946
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue all videos of a blip.tv user, honouring playlist limits."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id lives in a data attribute of the profile page.
        # Check for a failed match explicitly: dereferencing None here used
        # to crash with an uncaught AttributeError.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the handler above.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2036
2037
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles.com page into a direct file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode the page up front: the regexes below are text patterns
            # (bytes input raises TypeError on Python 3), and the result
            # dict fields are expected to be Unicode strings.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # The page was decoded above, so all extracted values are already
        # text; the former per-field .decode('utf-8') calls broke on
        # Python 3 where str has no decode method.
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2096
2097
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials are available (optional step)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials supplied; continue without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response: urlopen().read() returns bytes on Python 3
            # and the str regex below cannot be applied to bytes.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # A login form in the response means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video URL, title, duration and thumbnail for a Facebook video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded in a JS snippet between these
        # two literal markers.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the actual stream data.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2195
2196
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information via blip.tv's JSON API (or direct download)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; resolve it and recurse with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different results depending on the User-Agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # basename comes from the URL, which is already text; the old
                # title.decode('UTF-8') raised AttributeError on Python 3.
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or flat.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2297
2298
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV URL and title for a myvideo.de video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was self._download.report_error, which raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media server path; the FLV lives
        # next to it under the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2347
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate identifiers, as they appear in the config XML.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate id -> container extension (all renditions are MP4).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate id -> display resolution, shown by --list-formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-part media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate ids with extension and resolution."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part of the episode or clip at `url`."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortname forms (":tds", ":colbert", ...) map to the newest full
        # episode page of the corresponding show; re-match so the named
        # groups below refer to the rewritten URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Pick the title group matching the URL style; an empty 'episode'
        # group means we must follow the redirect to the newest episode.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            # Keep the handle: geturl() below reveals the redirect target.
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The "newest episode" URL redirects to a concrete episode page;
            # re-parse the final URL to recover the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The media URI is normally embedded as an mtvnservices.com URL.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the MRSS index: one <item> per part of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The <guid> is a colon-separated media id; its last component is
            # the short id and the second-to-last carries the show name.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-part config XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into the equivalent progressive-HTTP URL
            # on the CDN, keyed by the path after "gsp.comedystor/".
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2542
2543
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from the player configuration referenced by the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset from the Content-Type header, defaulting to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Description and thumbnail are optional metadata; a missing tag
        # previously crashed with AttributeError on .group(1).
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL, and the config URL embedded in it, are required.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2617
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract metadata from the moogaloop XML, then the f4f fragment URL from the f4m manifest."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group('videoid')

        info = {'id': video_id, 'uploader': None, 'upload_date': None}

        self.report_extraction(video_id)
        # First request: metadata XML with title, description and manifest URL.
        xml_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(xml_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Second request: the Adobe HDS (f4m) manifest.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Assemble the direct fragment URL from the manifest's host and ids.
        parsed = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = parsed.scheme + '://' + parsed.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2688
2689
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The video URL is stored URL-encoded in the page's flash variables.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The title comes from the <title> tag, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # The thumbnail URL appears verbatim in the page source.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': thumb_match.group(0),
            'description': None,
        }]
2747
2748
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track page URL through the Soundcloud API and
        return a single-element list with the track's info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve endpoint maps the human-readable page URL to the
        # track's numeric API id and metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint lists downloadable stream URLs per format.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2821
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set page URL and return one info dict per track.

        Error reporting uses report_error (which adds its own prefix),
        replacing the deprecated _downloader.trouble(u'ERROR: ...') calls.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # The resolve endpoint maps the human-readable URL to the set's API data
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports problems (e.g. a nonexistent set) in-band.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2902
2903
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL and metadata from an InfoQ page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds the stream path base64-encoded
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only: filenames containing extra dots
        # (e.g. 'talk.part1.mp4') crashed the plain split('.') unpacking.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2957
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Falls back to the highest available bitrate when the requested
        one is missing or 'best'; formats without bitrate info map
        directly to a url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # dead link: try the next candidate
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud API for a cloudcast and pick a working URL.

        Note: regex match groups and API strings are already text; the
        previous .decode('utf-8') calls on them raised AttributeError
        under Python 3 (str has no decode method).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes first: json.loads only accepts
        # bytes on Python >= 3.6)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3072
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a course page (course=... only), or a single
    # video page (course=...&video=...); the named groups pick the branch
    # taken in _real_extract below.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, a whole course, or the entire site.

        Course and root pages are handled by collecting child page links
        and recursively calling self.extract() on each reference entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            # Derive the extension from the last path component of the URL
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # Fall back to the course id when no heading is found
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked video page becomes a 'reference' entry that is
            # resolved by the recursive self.extract() calls below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Recurse into every course page linked from the home page
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3184
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV video.

        The webpage is already a text string, so the old
        .decode('iso-8859-1') calls on match groups raised
        AttributeError under Python 3 and have been removed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error replaces the deprecated _downloader.trouble() call
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3264
3265
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv/mp4 videos)."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the current time in ms
        plus two random numbers, as expected by the getFlvPath endpoint."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-dependent character permutation used to
        descramble file ids (a linear-congruential shuffle of `source`)."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG step; the scaled value selects the next character, which
            # is then removed so each character is emitted exactly once.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Descramble a '*'-separated file id: each numeric token indexes
        into the permutation produced by _get_file_ID_mix_string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, descramble the file id, and return one
        info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's --format request onto the formats the site offers
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # one key per segment, required by the download URL below
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3375
3376
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of the page source."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch the page; everything we need is embedded in its source.
        try:
            page_bytes = compat_urllib_request.urlopen(url).read()
            webpage = page_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3439
3440
3441 class GooglePlusIE(InfoExtractor):
3442     """Information extractor for plus.google.com."""
3443
3444     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3445     IE_NAME = u'plus.google'
3446
3447     def __init__(self, downloader=None):
3448         InfoExtractor.__init__(self, downloader)
3449
3450     def report_extract_entry(self, url):
3451         """Report downloading extry"""
3452         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3453
3454     def report_date(self, upload_date):
3455         """Report downloading extry"""
3456         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3457
3458     def report_uploader(self, uploader):
3459         """Report downloading extry"""
3460         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3461
3462     def report_title(self, video_title):
3463         """Report downloading extry"""
3464         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3465
3466     def report_extract_vid_page(self, video_page):
3467         """Report information extraction."""
3468         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3469
3470     def _real_extract(self, url):
3471         # Extract id from URL
3472         mobj = re.match(self._VALID_URL, url)
3473         if mobj is None:
3474             self._downloader.report_error(u'Invalid URL: %s' % url)
3475             return
3476
3477         post_url = mobj.group(0)
3478         video_id = mobj.group(1)
3479
3480         video_extension = 'flv'
3481
3482         # Step 1, Retrieve post webpage to extract further information
3483         self.report_extract_entry(post_url)
3484         request = compat_urllib_request.Request(post_url)
3485         try:
3486             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3487         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3488             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3489             return
3490
3491         # Extract update date
3492         upload_date = None
3493         pattern = 'title="Timestamp">(.*?)</a>'
3494         mobj = re.search(pattern, webpage)
3495         if mobj:
3496             upload_date = mobj.group(1)
3497             # Convert timestring to a format suitable for filename
3498             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3499             upload_date = upload_date.strftime('%Y%m%d')
3500         self.report_date(upload_date)
3501
3502         # Extract uploader
3503         uploader = None
3504         pattern = r'rel\="author".*?>(.*?)</a>'
3505         mobj = re.search(pattern, webpage)
3506         if mobj:
3507             uploader = mobj.group(1)
3508         self.report_uploader(uploader)
3509
3510         # Extract title
3511         # Get the first line for title
3512         video_title = u'NA'
3513         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3514         mobj = re.search(pattern, webpage)
3515         if mobj:
3516             video_title = mobj.group(1)
3517         self.report_title(video_title)
3518
3519         # Step 2, Stimulate clicking the image box to launch video
3520         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3521         mobj = re.search(pattern, webpage)
3522         if mobj is None:
3523             self._downloader.report_error(u'unable to extract video page URL')
3524
3525         video_page = mobj.group(1)
3526         request = compat_urllib_request.Request(video_page)
3527         try:
3528             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3529         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3530             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3531             return
3532         self.report_extract_vid_page(video_page)
3533
3534
3535         # Extract video links on video page
3536         """Extract video links of all sizes"""
3537         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3538         mobj = re.findall(pattern, webpage)
3539         if len(mobj) == 0:
3540             self._downloader.report_error(u'unable to extract video links')
3541
3542         # Sort in resolution
3543         links = sorted(mobj)
3544
3545         # Choose the lowest of the sort, i.e. highest resolution
3546         video_url = links[-1]
3547         # Only get the url. The resolution part in the tuple has no use anymore
3548         video_url = video_url[-1]
3549         # Treat escaped \u0026 style hex
3550         try:
3551             video_url = video_url.decode("unicode_escape")
3552         except AttributeError: # Python 3
3553             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3554
3555
3556         return [{
3557             'id':       video_id,
3558             'url':      video_url,
3559             'uploader': uploader,
3560             'upload_date':  upload_date,
3561             'title':    video_title,
3562             'ext':      video_extension,
3563         }]
3564
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN video URL from the page path and scrape metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The video itself is served from Turner's CDN at a predictable path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first (HTML-unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date'; the downloader only
            # recognizes 'upload_date' (see the optional-fields contract)
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3600
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item count, list of info dicts).

        BUGFIX: error paths now return (0, []) instead of a bare None, so
        the unconditional tuple unpacking in _real_extract no longer raises
        a TypeError after a reported (and possibly ignored) error.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        # A successful response is a JSON list; errors come back as an object
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO 'YYYY-MM-DD...'; strip dashes for YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a single broadcast, or page through a channel's archives."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Only the channel name matched: list its archives page by page
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A page shorter than the limit means we reached the last one
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3687
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the video source, title and description from the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUGFIX: raise instead of report_error + fall-through, which
            # crashed on m.group('url') (None) when errors were ignored
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> if the player heading is absent
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # BUGFIX: _downloader.trouble did not abort; raising avoids
                # the subsequent m.group('title') crash on None
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3726
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE must be passed here
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return one info dict per trailer found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # URLs, titles and thumbnails appear in the same order on the page
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                # BUGFIX: skip this trailer instead of appending an info
                # dict with an empty 'url', which broke the downloader
                continue
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            })
        return videos
3767
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Derive the CDN URL from the video id and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Recorded videos live at a predictable CDN path keyed by id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3789
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Scrape the hosted video URL, title and thumbnail from the page."""
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Pick the container extension from the matched URL
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # BUGFIX: use report_error; _downloader.trouble is deprecated and
            # the hand-written 'ERROR: ' prefix duplicated its own prefix
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # BUGFIX: fixed typo in the fallback title ('Start' -> 'Star')
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3845
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Parse the embedded 'gon.show' JSON blob for stream metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if json_match is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the Akamai URL
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3880
3881
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict matching req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Extract all download formats and select per --format options."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is a simple cookie check
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 is '<size>_<bitrate>_<id>'; keep size and bitrate
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: the original tested the stale 'result' variable here,
            # so an unavailable requested format silently returned [None]
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3998
3999
4000
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Scrape the FLV URL and upload date; title comes from the URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: message said 'video title' (copy-paste error); this
            # branch fails on the upload date
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4042
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page and extract the real video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            # CONSISTENCY FIX: dropped the redundant 'ERROR: ' prefix from the
            # ExtractorError messages; the error type already conveys it, and
            # every other extractor in this file raises without the prefix
            raise ExtractorError(u'unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
4088
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the play API track by track and collect every song's info."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The play API requires a (random) session token per listening session
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
        return entries
4132
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build CDN URLs from the video id and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Video and thumbnail live at predictable CDN paths keyed by id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4156
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE must be passed here
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = match.group('playlist_id')
            name = match.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME, playlist_id, name))
            return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        video_matches = re.finditer(video_RE, webpage, re.VERBOSE)
        name_matches = re.finditer(video_name_RE, webpage)
        # Talk entries and their titles appear in the same order on the page
        info = []
        for video_match, name_match in zip(video_matches, name_matches):
            talk_id = video_match.group('video_id')
            talk_url = 'http://www.ted.com%s' % name_match.group('talk_url')
            info.append(self._talk_info(talk_url, talk_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        name_match = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = name_match.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail'),
        }
4229
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    Resolves the video id from the URL path, fetches the site's XML
    metadata endpoint and builds a single-entry result list from it.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory,
        # the remaining fields fall back gracefully when absent
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fix: the original referenced the undefined name `ext` here,
            # raising NameError whenever <format_id> was missing; fall back
            # to the file extension derived from the download URL instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4285
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos.

    Scrapes the title from the HTML page and the stream filename/duration
    from the site's per-video flash XML descriptor.
    """
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # the last entry in the descriptor is the variant we download
        variant = idoc[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4318
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com.

    Extracts the direct video URL plus title/description/uploader from
    the page's player config and OpenGraph meta tags.
    """

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # use report_error for consistency with the other extractors
            # instead of the deprecated trouble() helper
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Fix: the original reported the failure but then fell through to
            # m.group('title'), crashing with AttributeError; bail out cleanly
            # like the other failure paths.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4367
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek.

    Parses the media streams registered via mediaCollection.addMediaStream
    and picks the default media type at the highest advertised quality;
    the result is either an RTMP stream or a plain HTTP mp4 download.
    """
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # prefer an explicit documentId query parameter over the URL path
        doc_id_match = re.search(r'documentId=([0-9]+)', url)
        if doc_id_match:
            video_id = doc_id_match.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        page = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, page).group('title')
        streams = [match.groupdict() for match in re.finditer(self._MEDIA_STREAM, page)]
        if not streams:
            # no streams at all means the page is age-restricted ("fsk")
            assert '"fsk"' in page
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # default media type (0), highest quality value wins
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # RTMP: the rtmp URL is the download URL, the mp4 is the play path
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            # plain HTTP download
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4407
4408
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # GenericIE must stay last: it matches almost anything.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]