Fix playlists with size 50i āˆ€ iāˆˆā„• (Closes #782)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; _ready is also reset per instance in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # _real_initialize runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE".
        # Subclasses usually shadow this with a plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        # note=False suppresses the status line entirely.
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): sys, ExtractorError and the compat_* names are
            # presumably provided by the wildcard import from .utils — confirm.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset declared in the Content-Type header, falling
        # back to UTF-8 when none is given.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                # Request objects know their full URL; plain strings are the URL.
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + url)
            # base64 keeps the dump terminal-safe regardless of page content.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids crashing on pages with a broken declared encoding.
        return webpage_bytes.decode(encoding, 'replace')
146
147
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; anything not listed is assumed to be flv.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display size. NOTE(review): values appear to be height x width
    # (e.g. itag 22 is listed as '720x1280') — confirm before relying on order.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name, or an
        (error_message, None) tuple if the list could not be fetched."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # findall yields (name, lang_code) pairs; key the dict by lang_code.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for video_id."""
        # NOTE(review): on error _get_available_subtitles returns a tuple,
        # which has no .keys(); report_video_subtitles_available would raise.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): # error case: (message, None) instead of the dict
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; return a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): # error case: (message, None) instead of the dict
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print itag, extension and dimensions for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the language, optionally log in, and confirm age on youtube.com."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX / dsh tokens the Google login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Extract the bare video ID from any supported YouTube URL form,
        or report an error and return None for an unrecognized URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Group 1 is the optional URL prefix; group 2 is the video ID itself.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract the list of information dictionaries for one video URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        # Try several 'el' values; different ones are needed for embedded-only
        # or VEVO videos. Stop at the first response that contains a token.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize '/' ',' '-' separators to single spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            # NOTE(review): no break after a successful parse — the remaining
            # expressions simply fail on the already-reformatted value and are
            # swallowed by the bare except.
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict also carries a 'sig'
            # field; the filter above only checks itag/url, so an entry
            # without 'sig' would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            # format_limit caps quality: keep only formats at or below it.
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one info dict per selected (format, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # format_param is None for RTMP downloads; fall back to the extension.
            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
645
646
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Handles http://www.metacafe.com/watch/<id>/<title>/ pages. Videos whose
    id starts with "yt-" are actually hosted on YouTube and are re-dispatched
    through the downloader instead of being extracted here.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Age gate: the disclaimer page is fetched first, then the family-filter
    # form is POSTed before any watch page is requested.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form.

        Presumably this sets age-gate cookies via the globally installed
        opener (not visible here) -- confirm. Errors are reported and then
        swallowed; extraction will simply fail later if the gate mattered.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video info from a metacafe watch URL.

        Returns a single-entry list of info dicts, or None after reporting
        an error (and nothing for "yt-" videos, which are re-dispatched).
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: the media URL is embedded directly.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages carry the media data inside the flashvars value.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls here and below assume
        # byte-string matches (Python 2); on Python 3 str has no decode()
        # -- verify before running under 3.x.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
772
773
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[dailymotion] %s: Extracting information' % video_id
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        # Validate the URL and pull out the video id
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id may carry a "_title" suffix and/or a query string; strip both
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Locate the flashvars blob that carries the stream URLs
        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best available quality, from highest to lowest
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Try the regular owner span first, then the official-user markup
        video_uploader = None
        match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if match is not None:
            video_uploader = match.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows the date as DD-MM-YYYY; convert to YYYYMMDD
        video_upload_date = None
        match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if match is not None:
            video_upload_date = match.group(3) + match.group(2) + match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
861
862
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        message = u'[photobucket] %s: Downloading webpage' % video_id
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[photobucket] %s: Extracting information' % video_id
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        # The video id is the flv filename from the "current" query parameter
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Download the page that embeds the player
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The direct media URL is advertised in the video_src link tag
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Both title and uploader come out of the page title
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
926
927
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information.

        Non-/watch/ URLs are first resolved to their canonical
        http://video.yahoo.com/watch/<vid>/<id> form and re-extracted
        (recursing once with new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        # NOTE(review): the .decode('utf-8') calls in this method assume
        # byte-string matches (Python 2); str has no decode() on Python 3
        # -- verify before re-enabling (_WORKING is False).
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # Group 1 is the (people|profile) path segment; the uploader's
        # display name is the link text captured by group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1069
1070
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL.

        Returns a single-entry list of info dicts, or None after
        reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize player/HLS-redirect links to the canonical video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Deliberately broad (IndexError from the splits, ValueError from
            # json.loads), but no longer a bare except: that would also
            # swallow KeyboardInterrupt/SystemExit.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1189
1190
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams use index-<n>.html style video ids
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url, returning its body or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex and return the requested groups.

        matchTuples is a sequence of (group_index, key, error_message)
        triples; the result is a {key: matched group} dict. Returns None
        (after reporting) on any failure.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # fetch_webpage already reported the problem; bail out instead
            # of handing None to re.search (which raises TypeError).
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): trouble() is the legacy error API; sibling
                # extractors use report_error() -- confirm before unifying.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve and return the rtmp URL of a live stream.

        The assembled URL is returned (it was previously computed and
        silently discarded).
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            # Raw strings: '\.' is an invalid escape sequence in a plain
            # string literal (DeprecationWarning on modern Pythons).
            r'(s_artestras_scst_geoFRDE_' + video_lang + r".*?)'.*?" +
                r'(http://.*?\.swf).*?' +
                r"(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        return video_url

    def extractPlus7Stream(self, url):
        """Extract an Arte+7 (catch-up) video and return its info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode('utf-8') assumes a byte-string title
            # (Python 2); str has no decode() on Python 3 -- verify.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # TODO: the live path resolves the rtmp URL (now returned by
            # extractLiveStream) but does not yet yield any result entries.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1325
1326
1327 class GenericIE(InfoExtractor):
1328     """Generic last-resort information extractor."""
1329
1330     _VALID_URL = r'.*'
1331     IE_NAME = u'generic'
1332
    def __init__(self, downloader=None):
        # Plain delegation to the base class; GenericIE keeps no state of its own.
        InfoExtractor.__init__(self, downloader)
1335
1336     def report_download_webpage(self, video_id):
1337         """Report webpage download."""
1338         if not self._downloader.params.get('test', False):
1339             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1340         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1341
1342     def report_extraction(self, video_id):
1343         """Report information extraction."""
1344         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1345
1346     def report_following_redirect(self, new_url):
1347         """Report information extraction."""
1348         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1349
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Returns True (after re-dispatching the resolved URL through the
        downloader) when url redirects somewhere else, False otherwise.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are fetched.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unescaped spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        # Follow the whole redirect chain with HEAD requests only.
        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Identical URL after opening means there was no redirect.
        if url == new_url:
            return False

        # Restart the extraction chain on the resolved URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1404
1405     def _real_extract(self, url):
1406         if self._test_redirect(url): return
1407
1408         video_id = url.split('/')[-1]
1409         try:
1410             webpage = self._download_webpage(url, video_id)
1411         except ValueError as err:
1412             # since this is the last-resort InfoExtractor, if
1413             # this error is thrown, it'll be thrown here
1414             self._downloader.report_error(u'Invalid URL: %s' % url)
1415             return
1416
1417         self.report_extraction(video_id)
1418         # Start with something easy: JW Player in SWFObject
1419         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1420         if mobj is None:
1421             # Broaden the search a little bit
1422             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1423         if mobj is None:
1424             # Broaden the search a little bit: JWPlayer JS loader
1425             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1426         if mobj is None:
1427             self._downloader.report_error(u'Invalid URL: %s' % url)
1428             return
1429
1430         # It's possible that one of the regexes
1431         # matched, but returned an empty group:
1432         if mobj.group(1) is None:
1433             self._downloader.report_error(u'Invalid URL: %s' % url)
1434             return
1435
1436         video_url = compat_urllib_parse.unquote(mobj.group(1))
1437         video_id = os.path.basename(video_url)
1438
1439         # here's a fun little line of code for you:
1440         video_extension = os.path.splitext(video_id)[1][1:]
1441         video_id = os.path.splitext(video_id)[0]
1442
1443         # it's tempting to parse this further, but you would
1444         # have to take into account all the variations like
1445         #   Video Title - Site Name
1446         #   Site Name | Video Title
1447         #   Video Title - Tagline | Site Name
1448         # and so on and so forth; it's just not practical
1449         mobj = re.search(r'<title>(.*)</title>', webpage)
1450         if mobj is None:
1451             self._downloader.report_error(u'unable to extract title')
1452             return
1453         video_title = mobj.group(1)
1454
1455         # video uploader is domain name
1456         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1457         if mobj is None:
1458             self._downloader.report_error(u'unable to extract title')
1459             return
1460         video_uploader = mobj.group(1)
1461
1462         return [{
1463             'id':       video_id,
1464             'url':      video_url,
1465             'uploader': video_uploader,
1466             'upload_date':  None,
1467             'title':    video_title,
1468             'ext':      video_extension,
1469         }]
1470
1471
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # %s: quoted query, %i: 1-based start index; page size is fixed at 50.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearchN:term' query and trigger the downloads.

        An empty prefix means one result, 'all' means the service maximum,
        and anything else is interpreted as a result count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            # Keep the try block minimal: only the integer parse can
            # legitimately raise ValueError here; previously the whole
            # branch (including the download call) was inside the try.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                # CONSISTENCY FIX: use report_error like every other error
                # path in this extractor (trouble() is the legacy API).
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The service may report fewer total items than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1550
1551
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Interpret a 'gvsearchN:term' query and start the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix -> single result, 'all' -> service maximum,
        # otherwise the prefix is a result count.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect new ids from this page, stopping as soon as the
            # requested count has been gathered.
            got_enough = False
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in video_ids:
                    continue
                video_ids.append(candidate)
                if len(video_ids) == n:
                    got_enough = True
                    break

            # Finished when we have n ids or when no "next page" link exists.
            if got_enough or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1632
1633
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Interpret a 'yvsearchN:term' query and start the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix -> one result, 'all' -> service maximum,
        # anything else is parsed as a result count.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect unseen ids; stop as soon as n have been gathered.
            got_enough = False
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in already_seen:
                    video_ids.append(candidate)
                    already_seen.add(candidate)
                    if len(video_ids) == n:
                        got_enough = True
                        break

            # Finished when we have n ids or when no "Next" link exists.
            if got_enough or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1718
1719
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Accepts full playlist/course/artist/watch URLs as well as bare
    # playlist ids (optionally prefixed with PL, EC or UU).
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData playlist feed; %i slots are the page size and 1-based start index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the flag must be
        # passed explicitly when matching.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch all pages of the playlist feed and queue each video."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                # (every earlier page was full, so this one is empty).
                # Without this check such playlists caused a KeyError.
                break

            # Keep (position, url) pairs so the playlist order can be
            # restored once all pages have been fetched.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then drop the position key.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # --playlist-start is 1-based; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            # -1 means "no upper bound".
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1814
1815
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of the channel and queue the downloads."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []

        # Walk the paginated video list until the "next page" marker
        # disappears from the HTML.
        for pagenum in itertools.count(1):
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Deduplicate ids within this page while preserving order.
            ids_in_page = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1866
1867
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch the user's uploads via the GData API and queue them."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData feed is paginated (currently 50 ids per request), so
        # request one slice after another until a short page signals the
        # end of the uploads list.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Deduplicate ids within this page while preserving order.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one - there are
            # no more ids on further pages, so stop querying.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        all_ids_count = len(video_ids)
        # --playlist-start is 1-based; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            # -1 means "no upper bound".
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1949
1950
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """List all videos of a blip.tv user and queue them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # BUG FIX: the user-id lookup used to live inside the network
        # try/except above, so a page without data-users-id died with an
        # uncaught AttributeError instead of a clean error message.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user ID from webpage: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Use compat_str for consistency with the other handlers in
                # this file (str(err) may fail on unicode messages in py2).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers.
            # BUG FIX: unescape before the membership test - previously the
            # raw href was compared against already-unescaped entries, so
            # duplicates containing HTML entities slipped through.
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # --playlist-start is 1-based; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            # -1 means "no upper bound".
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2040
2041
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles.com link to the direct file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # No download form found; surface the site's own explanation
            # when one is present on the page.
            reason = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if reason is not None and reason.group(1) is not None:
                restriction_message = re.sub(r'\s+', ' ', reason.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        title_match = re.search(r'<b title="(.*?)">', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = title_match.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2100
2101
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Credentials come from --username/--password or, with --netrc, from
        the 'facebook' machine entry. Login failures only emit a warning so
        that public videos remain downloadable.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response so the str regex below works on Python 3
            # (re.search with a str pattern raises TypeError on bytes).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video metadata from a Facebook video/photo page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as a JSON array between these
        # two JavaScript fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2199
2200
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL serves the media file directly."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Handles three cases: /play/ URLs (resolved via redirect and
        re-extracted), URLs that serve the media directly, and regular
        pages queried through blip.tv's JSON API.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment contains the real
        # file reference; rebuild the canonical URL and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Request the JSON description of the video.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the User-Agent
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # basename comes from the (text) URL, so it is already a
                # str; the old title.decode('UTF-8') crashed on Python 3.
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the filename extension from the media URL
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2301
2302
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # NOTE: the boilerplate __init__ that only forwarded to
    # InfoExtractor.__init__ was removed; the inherited one is identical.

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this previously called self._download.report_error,
            # which raised AttributeError (no such attribute).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link reveals the media server path; the FLV lives
        # next to the thumbnails directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2351
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates offered by the media feed; keys into the two tables below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (used by --list-formats output).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate (used by --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE,
        # which the default suitable() does not apply.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        # Announce the start of extraction for this episode.
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        # Announce the download of the per-part media configuration.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        # Announce the download of the RSS index listing the episode parts.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Pretty-print the available bitrates with extension and dimensions
        # (invoked for --list-formats).
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode or a single clip.

        Returns a list of info dictionaries, one per episode part,
        or None after reporting an error.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcut forms (:tds, :colbert, ...) map to the newest full
        # episode of the corresponding show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Single-clip URL: the title group differs per show.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Bare /full-episodes/ URL means "download the newest episode";
            # the site redirects it to a specific episode (handled below).
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-match against the URL we were redirected to, which should
            # now name a concrete episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mgid-style media URI embedded in the player markup.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The RSS index lists each part of the episode as an <item>.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # <guid> looks like mgid:...:<show>.com:<mediaId>; split out the
            # short media id and the show id from its last two segments.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-part configuration lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs; the feed orders them by
            # ascending bitrate.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL to the equivalent plain-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2546
2547
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video data from an escapistmagazine.com view page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = match.group('showname')
        video_id = match.group('episode')

        self.report_extraction(show_name)
        try:
            response = compat_urllib_request.urlopen(url)
            raw_page = response.read()
            # Honour the charset from the Content-Type header, default utf-8
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', response.headers['Content-Type'])
            page = raw_page.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Pull metadata out of the OpenGraph/meta tags
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL carries the (percent-encoded) config URL
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_resp = compat_urllib_request.urlopen(config_url)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_resp.headers['Content-Type'])
            config_text = config_resp.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL
        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2621
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the f4f fragment URL via the moogaloop metadata and the
        Adobe f4m manifest it points to."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            # attrib['url'] raises KeyError (not IndexError) when the
            # attribute is absent, so catch both to report a bad manifest
            # instead of crashing with an unhandled exception.
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the fragment URL on the manifest's host; the manifest <id>
        # ends in a 2-char suffix that is not part of the path.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2692
2693
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from a video page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = m.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is percent-encoded inside the flashvars
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title: page <title> with the site suffix stripped
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: keep the whole matched image URL
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2751
2752
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track permalink via the API and return its mp3 stream info."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve.json maps the permalink to the track's API record
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # the streams endpoint lists the actual media urls for the track
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2825
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set permalink and return stream info for every track in it."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # use report_error like the other extractors (trouble is deprecated)
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # resolve.json maps the set permalink to the set's API record
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # the API reports problems as a list of error objects
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # the streams endpoint lists the actual media urls for each track
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2906
2907
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream url and metadata from an InfoQ page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL (base64-encoded in the page's jsclassref attribute)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split only on the LAST dot so filenames that themselves contain
        # dots (e.g. 'foo.bar.mp4') don't raise ValueError on unpacking.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2961
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        With bitrate 'best' (or an unknown bitrate) the highest available
        one is chosen; formats without bitrate info return their url list
        directly.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass # dead link, try the next candidate

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud API for a cloudcast and pick a working url."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: match groups are already text; calling .decode() on them
        # crashes on Python 3, so they are used directly.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (the response is bytes; decode before loading)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe each format until a live url is found
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3076
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: the site root, a course page
    # (?course=...) and a single video (?course=...&video=...).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, a course playlist, or
        the site root. Playlist branches recurse via self.extract() on
        every referenced page and concatenate the results."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            # per-video metadata is served as a small XML document
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                # findall returned an empty list: required tags are missing
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # fall back to the course id when no heading is present
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # recurse into every video page linked from the course page
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # recurse into every course page linked from the root page
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3188
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video url and metadata from an MTV video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns decoded text, so the meta values
        # are used as-is (calling .decode() on them breaks on Python 3).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing available renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # use report_error like the rest of this method (trouble is deprecated)
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3268
3269
class YoukuIE(InfoExtractor):
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two random numbers."""
        now_ms = int(time.time() * 1000)
        rnd_a = random.randint(1000, 1998)
        rnd_b = random.randint(1000, 9999)
        return '%d%d%d' % (now_ms, rnd_a, rnd_b)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the key alphabet using *seed* as PRNG state."""
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        for _ in range(len(alphabet)):
            # simple linear-congruential step drives the selection
            state = (state * 211 + 30031) % 65536
            pos = int(math.floor(state / 65536 * len(alphabet)))
            shuffled.append(alphabet.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated file id via the shuffled alphabet."""
        key = self._get_file_ID_mix_string(seed)
        return ''.join(key[int(part)] for part in fileId.split('*') if part)

    def _real_extract(self, url):
        """Return download info for every flv/mp4 segment of a Youku video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # map the requested quality onto Youku's stream names
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [seg['k'] for seg in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # characters 8-9 of the decoded file id carry the segment number in hex
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3379
3380
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv url, title and thumbnail out of an xnxx video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch the page; everything we need is embedded in its markup.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3443
3444
3445 class GooglePlusIE(InfoExtractor):
3446     """Information extractor for plus.google.com."""
3447
3448     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3449     IE_NAME = u'plus.google'
3450
    def __init__(self, downloader=None):
        # Plain delegation to the base class; no extra state of its own.
        InfoExtractor.__init__(self, downloader)
3453
    def report_extract_entry(self, url):
        """Report that the Google+ post page is being downloaded."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3457
    def report_date(self, upload_date):
        """Report the upload date extracted from the entry."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3461
    def report_uploader(self, uploader):
        """Report the uploader extracted from the entry."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3465
    def report_title(self, video_title):
        """Report the title extracted from the entry."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3469
    def report_extract_vid_page(self, video_page):
        """Report that the actual video page is being extracted."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3473
3474     def _real_extract(self, url):
3475         # Extract id from URL
3476         mobj = re.match(self._VALID_URL, url)
3477         if mobj is None:
3478             self._downloader.report_error(u'Invalid URL: %s' % url)
3479             return
3480
3481         post_url = mobj.group(0)
3482         video_id = mobj.group(1)
3483
3484         video_extension = 'flv'
3485
3486         # Step 1, Retrieve post webpage to extract further information
3487         self.report_extract_entry(post_url)
3488         request = compat_urllib_request.Request(post_url)
3489         try:
3490             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3491         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3492             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3493             return
3494
3495         # Extract update date
3496         upload_date = None
3497         pattern = 'title="Timestamp">(.*?)</a>'
3498         mobj = re.search(pattern, webpage)
3499         if mobj:
3500             upload_date = mobj.group(1)
3501             # Convert timestring to a format suitable for filename
3502             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3503             upload_date = upload_date.strftime('%Y%m%d')
3504         self.report_date(upload_date)
3505
3506         # Extract uploader
3507         uploader = None
3508         pattern = r'rel\="author".*?>(.*?)</a>'
3509         mobj = re.search(pattern, webpage)
3510         if mobj:
3511             uploader = mobj.group(1)
3512         self.report_uploader(uploader)
3513
3514         # Extract title
3515         # Get the first line for title
3516         video_title = u'NA'
3517         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3518         mobj = re.search(pattern, webpage)
3519         if mobj:
3520             video_title = mobj.group(1)
3521         self.report_title(video_title)
3522
3523         # Step 2, Stimulate clicking the image box to launch video
3524         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3525         mobj = re.search(pattern, webpage)
3526         if mobj is None:
3527             self._downloader.report_error(u'unable to extract video page URL')
3528
3529         video_page = mobj.group(1)
3530         request = compat_urllib_request.Request(video_page)
3531         try:
3532             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3533         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3534             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3535             return
3536         self.report_extract_vid_page(video_page)
3537
3538
3539         # Extract video links on video page
3540         """Extract video links of all sizes"""
3541         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3542         mobj = re.findall(pattern, webpage)
3543         if len(mobj) == 0:
3544             self._downloader.report_error(u'unable to extract video links')
3545
3546         # Sort in resolution
3547         links = sorted(mobj)
3548
3549         # Choose the lowest of the sort, i.e. highest resolution
3550         video_url = links[-1]
3551         # Only get the url. The resolution part in the tuple has no use anymore
3552         video_url = video_url[-1]
3553         # Treat escaped \u0026 style hex
3554         try:
3555             video_url = video_url.decode("unicode_escape")
3556         except AttributeError: # Python 3
3557             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3558
3559
3560         return [{
3561             'id':       video_id,
3562             'url':      video_url,
3563             'uploader': uploader,
3564             'upload_date':  upload_date,
3565             'title':    video_title,
3566             'ext':      video_extension,
3567         }]
3568
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Extract the 720p mp4 stream for an NBA video page.

        The CDN URL is built directly from the page path; title, date and
        description are scraped from the page's metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First group of rexp in the page (HTML-unescaped), or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date'; the documented
            # optional field (see module docstring) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3604
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, valid_info_dicts).

        On a download or API error the problem is reported and (0, [])
        is returned, so the pagination loop in _real_extract terminates
        cleanly.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # BUGFIX: returning None here crashed the caller's tuple unpacking
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # On error the API returns a dict with an 'error' key instead
            # of the expected list of clips.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # BUGFIX: same None-unpacking crash as above
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract one video, or page through a channel's whole archive."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: page through the archive _JUSTIN_PAGE_LIMIT at a time.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3691
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract the mp4 source, title and description for a video page.

        Raises ExtractorError when the video source or title cannot be
        located in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUGFIX: previously reported the error and fell through to
            # m.group('url') on None, raising AttributeError instead.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> element.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # BUGFIX: _downloader.trouble is deprecated and did not abort;
                # raise like the other extractors in this file.
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3730
class SteamIE(InfoExtractor):
    # Information extractor for store.steampowered.com game trailer pages.
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains embedded comments, so it must be compiled
        # with re.VERBOSE here and in _real_extract.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return one info dict per trailer found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Matches the JS 'movie_<id>' player entries holding the flv URL.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three finditer streams are paired positionally by zip();
        # this assumes movies, titles and thumbnails appear in the same
        # order on the page — TODO confirm against current page markup.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
                  }
            videos.append(info)
        return videos
3771
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the info dict for a recorded Ustream video URL."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The flv download URL is derived directly from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'data-title="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title_match.group('title'),
            'uploader': uploader_match.group('uploader'),
        }]
3793
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the direct video URL, title and thumbnail for a WSHH page."""
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Pick the extension from the URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # BUGFIX: _downloader.trouble is deprecated; use report_error
            # like the rest of the extractors in this file.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # BUGFIX: fallback title previously read 'World Start Hip Hop'.
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3849
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Parse the page's embedded gon.show JSON blob for stream metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Force the 256 kbps stream variant.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3884
3885
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict matching req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Extract all available formats and honor the --format selection."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes '<size>_<bitrate>_<id>'.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: previously tested the stale 'result' variable here, so
            # an unavailable requested format silently returned [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
4002
4003
4004
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the flv URL and upload date for a pornotube video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # Locate the flv URL embedded in the player setup code.
        match = re.search(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group('url'))

        # Extract the upload date shown next to the uploader name.
        # NOTE(review): the error message says 'title' but this searches for
        # the date — kept verbatim to preserve behavior.
        match = re.search(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        upload_date = match.group('date')

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
4046
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Resolve the embed page for a video and extract its flv URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # The page <title> doubles as the video title.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv URL is passed to the flash player via addVariable().
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4092
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (song playlists)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dict per track in the mix.

        The mix metadata is embedded in the page as a ``PAGE.mix = {...}``
        JSON blob; individual track stream URLs are then fetched one at a
        time from the play/next API until it flags the last track.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # A random session id identifies this playback session to the API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            # The API flags the final track; stop paging then.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
4136
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Derive CDN URLs from the video id and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the video and its thumbnail live on the CDN, keyed by id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_match.group('uploader')),
        }]
4160
class TEDIE(InfoExtractor):
    # Information extractor for ted.com talks and playlists.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains embedded comments, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Talk ids/slugs and talk title/url pairs are scanned separately and
        # paired positionally with zip() — this assumes both appear in the
        # same order on the page (TODO confirm against current markup).
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and the media
        # slug needed to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4233
class MySpassIE(InfoExtractor):
    """Extractor for videos from myspass.de.

    The video id is the last path component of the URL (or the one before
    it when the URL has a trailing slash); all metadata comes from the
    site's XML metadata endpoint.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: this branch read the undefined name `ext`, raising
            # NameError whenever the XML had no <format_id>; fall back to
            # the file extension, as the `format` field defaults to ext.
            format = extension
        else:
            format = format_id_el.text
        # description and preview image are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4289
class SpiegelIE(InfoExtractor):
    """Extractor for videos hosted on spiegel.de."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The per-video XML descriptor lists the available variants.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Pick the last variant node (mirrors the original selection).
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4322
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Use report_error for consistency with the rest of the class
            # (trouble is the deprecated spelling).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Direct media URL embedded in the player config
        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUGFIX: previously only reported the problem and fell through
            # to m.group('title') on None, crashing with AttributeError.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4371
class ARDIE(InfoExtractor):
    """Extractor for the ARD Mediathek."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId query parameter as the video id,
        # falling back to the last URL path component.
        doc_match = re.search(r'documentId=([0-9]+)', url)
        if doc_match:
            video_id = doc_match.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [match.groupdict() for match in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        default_type = [s for s in streams if int(s['media_type']) == 0]
        stream = max(default_type, key=lambda s: int(s['quality']))

        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # RTMP stream: separate connection URL and mp4: play path
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info['url'] = stream['rtmp_url']
            info['play_path'] = stream['video_url']
        else:
            # plain HTTP download
            assert stream['video_url'].endswith('.mp4')
            info['url'] = stream['video_url']
        return [info]
4411
4412
def gen_extractors():
    """Instantiate every supported extractor, in priority order.

    The order does matter; the first extractor whose pattern matches a
    URL is the one that handles it, so GenericIE stays last.
    """
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]