Fix a crash when subtitles are not found and the option --all-subs is given
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance (see initialize()).
    _ready = False
    # FileDownloader used for screen output, option lookup and error reporting.
    _downloader = None
    # Subclasses set this to False to mark a known-broken extractor.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default extractor name: the class name minus its "IE" suffix.
        # Subclasses may shadow this property with a plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): `sys` is not imported in this module's header;
            # presumably it reaches this scope via `from .utils import *` — confirm.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset advertised in the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # No charset declared: fall back to UTF-8.
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        # 'replace' keeps decoding going on bad bytes instead of raising.
        return webpage_bytes.decode(encoding, 'replace')
137
138
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose-mode pattern: matches full watch/embed/short URLs and also a
    # naked video id (everything up to the id group is optional).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Visited once to force the site into English so scraping regexps match.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Extracts the next_url parameter from age-verification redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, with free (WebM) containers preferred at comparable quality.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension used when naming the downloaded file.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string shown by --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the InfoExtractor.IE_NAME property with a fixed name.
    IE_NAME = u'youtube'
198
199     @classmethod
200     def suitable(cls, url):
201         """Receives a URL and returns True if suitable for this IE."""
202         if YoutubePlaylistIE.suitable(url): return False
203         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
204
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a video subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
249
250     def _get_available_subtitles(self, video_id):
251         self.report_video_subtitles_download(video_id)
252         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
253         try:
254             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
255         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
256             return (u'unable to download video subtitles: %s' % compat_str(err), None)
257         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
258         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
259         if not sub_lang_list:
260             return (u'video doesn\'t have subtitles', None)
261         return sub_lang_list
262
263     def _list_available_subtitles(self, video_id):
264         sub_lang_list = self._get_available_subtitles(video_id)
265         self.report_video_subtitles_available(video_id, sub_lang_list)
266
267     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
268         """
269         Return tuple:
270         (error_message, sub_lang, sub)
271         """
272         self.report_video_subtitles_request(video_id, sub_lang, format)
273         params = compat_urllib_parse.urlencode({
274             'lang': sub_lang,
275             'name': sub_name,
276             'v': video_id,
277             'fmt': format,
278         })
279         url = 'http://www.youtube.com/api/timedtext?' + params
280         try:
281             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
282         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
283             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
284         if not sub:
285             return (u'Did not fetch video subtitles', None, None)
286         return (None, sub_lang, sub)
287
288     def _extract_subtitle(self, video_id):
289         """
290         Return a list with a tuple:
291         [(error_message, sub_lang, sub)]
292         """
293         sub_lang_list = self._get_available_subtitles(video_id)
294         sub_format = self._downloader.params.get('subtitlesformat')
295         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
296             return [(sub_lang_list[0], None, None)]
297         if self._downloader.params.get('subtitleslang', False):
298             sub_lang = self._downloader.params.get('subtitleslang')
299         elif 'en' in sub_lang_list:
300             sub_lang = 'en'
301         else:
302             sub_lang = list(sub_lang_list.keys())[0]
303         if not sub_lang in sub_lang_list:
304             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
305
306         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
307         return [subtitle]
308
309     def _extract_all_subtitles(self, video_id):
310         sub_lang_list = self._get_available_subtitles(video_id)
311         sub_format = self._downloader.params.get('subtitlesformat')
312         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
313             return [(sub_lang_list[0], None, None)]
314         subtitles = []
315         for sub_lang in sub_lang_list:
316             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
317             subtitles.append(subtitle)
318         return subtitles
319
320     def _print_formats(self, formats):
321         print('Available formats:')
322         for x in formats:
323             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
324
    def _real_initialize(self):
        """Set site language, log in (if credentials given) and confirm age.

        Credentials come from --username/--password or, with --netrc, from
        the 'youtube' machine entry in ~/.netrc. All failures are reported
        and abort initialization; cookies persist via the downloader's
        opener for subsequent requests.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page to scrape the hidden GALX/dsh form tokens.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            # Response body is unused; the request just sets the age cookie.
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return
428
429     def _extract_id(self, url):
430         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
431         if mobj is None:
432             self._downloader.report_error(u'invalid URL: %s' % url)
433             return
434         video_id = mobj.group(2)
435         return video_id
436
    def _real_extract(self, url):
        """Extract metadata and download URLs for a single YouTube video.

        Returns a list of info dictionaries (one per selected format), or
        None after reporting an error. Also handles the subtitle options
        (--write-sub, --all-subs, --list-subs) as side effects.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to single spaces before trying each format.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except swallows parse failures while
                    # trying each date format; catching ValueError only would
                    # be safer — confirm before tightening.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                # Single-track helper returns a one-element list.
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            # --list-subs only prints the languages; no download happens.
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        # NOTE(review): video_token is never read after this point — confirm
        # whether it is still needed.
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # --max-quality: drop everything better than the limit.
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
640
641
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter pages, visited once in _real_initialize() to disable the
    # filter so age-restricted videos are reachable.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # Shadows the InfoExtractor.IE_NAME property with a fixed name.
    IE_NAME = u'metacafe'
649
650     def __init__(self, downloader=None):
651         InfoExtractor.__init__(self, downloader)
652
    # Progress-reporting helpers: thin wrappers over the downloader's screen output.

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
668
669     def _real_initialize(self):
670         # Retrieve disclaimer
671         request = compat_urllib_request.Request(self._DISCLAIMER)
672         try:
673             self.report_disclaimer()
674             disclaimer = compat_urllib_request.urlopen(request).read()
675         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
676             self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
677             return
678
679         # Confirm age
680         disclaimer_form = {
681             'filters': '0',
682             'submit': "Continue - I'm over 18",
683             }
684         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
685         try:
686             self.report_age_confirmation()
687             disclaimer = compat_urllib_request.urlopen(request).read()
688         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
689             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
690             return
691
692     def _real_extract(self, url):
693         # Extract id and simplified title from URL
694         mobj = re.match(self._VALID_URL, url)
695         if mobj is None:
696             self._downloader.report_error(u'invalid URL: %s' % url)
697             return
698
699         video_id = mobj.group(1)
700
701         # Check if video comes from YouTube
702         mobj2 = re.match(r'^yt-(.*)$', video_id)
703         if mobj2 is not None:
704             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
705             return
706
707         # Retrieve video webpage to extract further information
708         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
709         try:
710             self.report_download_webpage(video_id)
711             webpage = compat_urllib_request.urlopen(request).read()
712         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
713             self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
714             return
715
716         # Extract URL, uploader and title from webpage
717         self.report_extraction(video_id)
718         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
719         if mobj is not None:
720             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
721             video_extension = mediaURL[-3:]
722
723             # Extract gdaKey if available
724             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
725             if mobj is None:
726                 video_url = mediaURL
727             else:
728                 gdaKey = mobj.group(1)
729                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
730         else:
731             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
732             if mobj is None:
733                 self._downloader.report_error(u'unable to extract media URL')
734                 return
735             vardict = compat_parse_qs(mobj.group(1))
736             if 'mediaData' not in vardict:
737                 self._downloader.report_error(u'unable to extract media URL')
738                 return
739             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
740             if mobj is None:
741                 self._downloader.report_error(u'unable to extract media URL')
742                 return
743             mediaURL = mobj.group(1).replace('\\/', '/')
744             video_extension = mediaURL[-3:]
745             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
746
747         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
748         if mobj is None:
749             self._downloader.report_error(u'unable to extract title')
750             return
751         video_title = mobj.group(1).decode('utf-8')
752
753         mobj = re.search(r'submitter=(.*?);', webpage)
754         if mobj is None:
755             self._downloader.report_error(u'unable to extract uploader nickname')
756             return
757         video_uploader = mobj.group(1)
758
759         return [{
760             'id':       video_id.decode('utf-8'),
761             'url':      video_url.decode('utf-8'),
762             'uploader': video_uploader.decode('utf-8'),
763             'upload_date':  None,
764             'title':    video_title,
765             'ext':      video_extension.decode('utf-8'),
766         }]
767
768
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Validate the URL and pull out the raw video identifier.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Drop any '_slug' suffix and query string from the id.
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled via cookie.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best quality key present in flashvars, highest first.
        max_quality = None
        for key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Try the regular owner markup first, then the "official user" markup.
        video_uploader = None
        owner = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner is not None:
            video_uploader = owner.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        when = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if when is not None:
            video_upload_date = when.group(3) + when.group(2) + when.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
856
857
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # The video id is the 'current' query parameter captured by _VALID_URL.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Fetch the page the video is embedded in.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The direct media URL lives in the video_src <link> element.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader both come from the page <title>.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
921
922
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-'/watch/' URLs are first rewritten to their canonical English
    '/watch/' form (one level of recursion via new_video=False), then the
    playlist XML is queried for the actual media URL.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info; returns a one-element info list, or None on error."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group(1) is the '(people|profile)' path segment; the
        # uploader name is the anchor text captured by group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist endpoint)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1064
1065
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the player config JSON embedded in the watch page and picks the
    best available codec/quality combination for the play_redirect URL.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info; returns a one-element info list, or None on error."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize: force https and collapse player redirect links to the
        # canonical watch page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded as '... = {config:{...},assets:...}'.
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit are no
        # longer swallowed: split()[1] raises IndexError and json.loads raises
        # ValueError on malformed pages.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty quality bucket, preferring hd > sd > other.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1184
1185
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles the "plus 7" catch-up pages by following a chain of XML
    descriptors.  Live-stream pages (index-<n>.html) are detected but not
    yet supported (see extractLiveStream).
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its contents, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and extract regex groups from the page.

        matchTuples is a list of (group_index, key, error_message) tuples;
        the matched groups are returned as a {key: value} dict, or None on
        any failure (after the failure has been reported).
        """
        page = self.fetch_webpage(url)
        # BUGFIX: fetch_webpage returns None on download errors; bail out
        # here instead of letting re.search() raise TypeError on None.
        if page is None:
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP URL of a live stream.

        NOTE(review): the computed video_url is never returned, so live
        streams currently extract nothing -- kept as-is pending support.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the XML descriptor chain for a catch-up ("plus 7") video."""
        video_lang = url.split('/')[-3]
        # Step 1: the page embeds the URL of a video-reference file.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the reference file points at a per-language descriptor.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the descriptor carries id, title, date and the HD media URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages end in index-<n>.html and are handled separately
        # (currently yielding no result; see extractLiveStream).
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1320
1321
1322 class GenericIE(InfoExtractor):
1323     """Generic last-resort information extractor."""
1324
1325     _VALID_URL = r'.*'
1326     IE_NAME = u'generic'
1327
1328     def __init__(self, downloader=None):
1329         InfoExtractor.__init__(self, downloader)
1330
1331     def report_download_webpage(self, video_id):
1332         """Report webpage download."""
1333         if not self._downloader.params.get('test', False):
1334             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1335         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1336
1337     def report_extraction(self, video_id):
1338         """Report information extraction."""
1339         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1340
1341     def report_following_redirect(self, new_url):
1342         """Report information extraction."""
1343         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1344
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # Issue a HEAD request so redirect targets can be discovered without
        # downloading any response body.
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unescaped spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Retry the same URL as a plain (GET) request.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        # Follow the full redirect chain; geturl() yields the final URL.
        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: let the normal extraction path continue.
        if url == new_url:
            return False

        # Restart the download chain on the resolved URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1399
1400     def _real_extract(self, url):
1401         if self._test_redirect(url): return
1402
1403         video_id = url.split('/')[-1]
1404         try:
1405             webpage = self._download_webpage(url, video_id)
1406         except ValueError as err:
1407             # since this is the last-resort InfoExtractor, if
1408             # this error is thrown, it'll be thrown here
1409             self._downloader.report_error(u'Invalid URL: %s' % url)
1410             return
1411
1412         self.report_extraction(video_id)
1413         # Start with something easy: JW Player in SWFObject
1414         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1415         if mobj is None:
1416             # Broaden the search a little bit
1417             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1418         if mobj is None:
1419             # Broaden the search a little bit: JWPlayer JS loader
1420             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1421         if mobj is None:
1422             self._downloader.report_error(u'Invalid URL: %s' % url)
1423             return
1424
1425         # It's possible that one of the regexes
1426         # matched, but returned an empty group:
1427         if mobj.group(1) is None:
1428             self._downloader.report_error(u'Invalid URL: %s' % url)
1429             return
1430
1431         video_url = compat_urllib_parse.unquote(mobj.group(1))
1432         video_id = os.path.basename(video_url)
1433
1434         # here's a fun little line of code for you:
1435         video_extension = os.path.splitext(video_id)[1][1:]
1436         video_id = os.path.splitext(video_id)[0]
1437
1438         # it's tempting to parse this further, but you would
1439         # have to take into account all the variations like
1440         #   Video Title - Site Name
1441         #   Site Name | Video Title
1442         #   Video Title - Tagline | Site Name
1443         # and so on and so forth; it's just not practical
1444         mobj = re.search(r'<title>(.*)</title>', webpage)
1445         if mobj is None:
1446             self._downloader.report_error(u'unable to extract title')
1447             return
1448         video_title = mobj.group(1)
1449
1450         # video uploader is domain name
1451         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1452         if mobj is None:
1453             self._downloader.report_error(u'unable to extract title')
1454             return
1455         video_uploader = mobj.group(1)
1456
1457         return [{
1458             'id':       video_id,
1459             'url':      video_url,
1460             'uploader': video_uploader,
1461             'upload_date':  None,
1462             'title':    video_title,
1463             'ext':      video_extension,
1464         }]
1465
1466
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo-URLs of the form 'ytsearch:terms', 'ytsearchN:terms'
    and 'ytsearchall:terms', querying the GData API and queueing the
    matching watch URLs for download.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): query arrives utf-8 encoded (see _real_extract), so
        # decoding with the locale's preferred encoding may mangle non-ASCII
        # queries on non-UTF-8 locales — confirm intended behavior.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch with the requested count."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # text after 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'ytsearch:' downloads a single result.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    # Clamp oversized requests instead of failing outright.
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                # Consistency fix: this used the deprecated trouble() helper;
                # every other error path in this file goes through report_error.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the total number of matches; never request more
            # pages than it can actually deliver.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1545
1546
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles pseudo-URLs of the form 'gvsearch:terms', 'gvsearchN:terms'
    and 'gvsearchall:terms'.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search results page with given number."""
        # NOTE(review): assumes query is a byte string (utf-8 encoded in
        # _real_extract); .decode would fail on a Python 3 str — confirm.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # text after 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'gvsearch:' downloads a single result.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    # Clamp oversized requests instead of failing outright.
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        # Walk the paginated search results, queueing as soon as n distinct
        # ids have been collected or the "next page" link disappears.
        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" marker: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1627
1628
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles pseudo-URLs of the form 'yvsearch:terms', 'yvsearchN:terms'
    and 'yvsearchall:terms'.
    """

    # NOTE(review): _WORKING = False appears to flag this extractor as
    # currently non-functional — confirm how the flag is consumed elsewhere.
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search results page with given number."""
        # NOTE(review): assumes query is a byte string (utf-8 encoded in
        # _real_extract); .decode would fail on a Python 3 str — confirm.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # text after 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'yvsearch:' downloads a single result.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    # Clamp oversized requests instead of failing outright.
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # fast membership test alongside the ordered list
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" marker: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1713
1714
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through the GData playlist feed, orders entries by their
    playlist position, honours --playlist-start/--playlist-end, and queues
    each video URL for download.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written in verbose mode.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Extract all videos of the playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        # group(1) matches ids embedded in a URL, group(2) bare ids.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so videos can be re-sorted into
            # playlist order after all pages are fetched. Entries without
            # 'content' (e.g. unavailable videos) are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then drop the position component.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Apply --playlist-start / --playlist-end; -1 means "until the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1807
1808
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a channel and queue it for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        collected_ids = []
        page_number = 1

        # Walk the paginated channel listing until the "next page" marker
        # no longer appears in the HTML.
        while True:
            self.report_download_page(channel_id, page_number)
            page_url = self._TEMPLATE_URL % (channel_id, page_number)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Pull watch links out of the page, de-duplicating within the page.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            collected_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_number += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected_ids)))

        for video_id in collected_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1859
1860
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the GData uploads feed of a user, honours
    --playlist-start/--playlist-end and queues each watch URL for download.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Extract all uploads of a user and queue them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            # De-duplication is per page only; ids repeated across pages
            # are kept as-is.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end; -1 means "until the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1942
1943
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric user id from the profile page, pages through the
    user's episode list via the mobile Ajax endpoint and queues each video
    for download.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Extract all videos of a blip.tv user and queue them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The numeric user id required by the episode-list API is embedded
            # in the profile page markup.
            # NOTE(review): if the attribute is absent, mobj is None and the
            # next line raises AttributeError, which this except does not catch.
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            # NOTE(review): membership is tested on the raw match but the
            # HTML-unescaped value is appended — confirm this asymmetry is
            # intended.
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end; -1 means "until the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2033
2034
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Fetch the file page (with the 'Free download' form submitted) and
        extract the direct download URL and file title.

        NOTE(review): the .decode('utf-8') calls below assume Python 2 byte
        strings; on Python 3 str objects they would raise AttributeError —
        confirm against the supported interpreter versions.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's error notice.
                # Fixed: the pattern is now a raw string; '\s' in a plain
                # string literal is an invalid escape sequence (deprecated).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2093
2094
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Optionally log in to Facebook using --username/--password or .netrc.

        Extraction proceeds without login if no credentials are available;
        login failures only emit warnings, they do not abort.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials at all: skip login entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # NOTE(review): urlopen().read() yields bytes on Python 3 while the
            # pattern below is a str regex -- confirm this path under Python 3.
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login form, the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract URL, title, duration and thumbnail for a Facebook video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Canonicalize to the video.php form regardless of the input URL shape.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player configuration is embedded as a JSON array in inline
        # JavaScript, delimited by these two exact code fragments.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the actual stream URLs.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream; fall back to SD; fail if neither is present.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2190
2191
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information via blip.tv's JSON API (skin=json).

        Handles three cases: /play/ player URLs (resolved to the canonical
        URL and re-extracted), direct video responses, and the regular JSON
        metadata response.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page whose URL fragment contains
        # the real file id; extract it and recurse on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Request the JSON representation of the same page.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv varies its response by client; impersonate iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): on Python 3 this is already str, which has no
                # .decode -- confirm the direct-download branch under Python 3.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # Pass the open handle through so the downloader reuses it.
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The metadata is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the container extension from the media URL itself.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2292
2293
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title for a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this used to call self._download.report_error, which
            # raised AttributeError (the attribute is named _downloader).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server base URL; the flv lives
        # alongside the thumbnails under the same hash directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2342
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates offered by the site, lowest to highest.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (used for --list-formats output).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate (used for --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's RSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrates with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of an episode or clip.

        Pipeline: resolve shortcuts/redirects -> find the mtvnservices URI in
        the page -> download the MRSS index -> for each <item>, download its
        configuration XML, pick a bitrate, and rewrite the RTMP URL to HTTP.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style shortcuts to the full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the site will redirect us to it below.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn which episode we landed on.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> of the MRSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the configuration XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL to the equivalent plain-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2537
2538
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from the page's player configuration.

        The page's og:video meta tag points at the Flash player; its
        'config=' query parameter points at a JS/JSON playlist from which
        the media URL is taken.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header, if any.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): none of the four searches below is checked for None;
        # a page missing any of these meta tags raises AttributeError here.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2612
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken: this extractor is skipped unless explicitly requested.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video metadata and build an f4f fragment URL.

        Two stages: the moogaloop metadata XML gives title/description/
        thumbnail plus a manifest URL; the Adobe HDS (f4m) manifest then
        yields the pieces of the final fragment URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally as the two XML documents are parsed.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest is namespaced; pull the media node's url attribute
        # and the document id, which together form the fragment URL.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2683
2684
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract URL, title and thumbnail from an xvideos watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flv URL is percent-encoded inside the player's query string.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # The whole matched image URL is the thumbnail.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2742
2743
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track page URL through the Soundcloud API and
        return a single-element list with the track's info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve endpoint maps the human-readable page URL to the
        # API track object (which carries the numeric track id).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the stream definitions; only the 128kbps mp3 stream is used.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2816
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set (playlist) URL and return one info dict per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error, not the deprecated trouble(), for consistency
            # with the other extractors in this file
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # The resolve endpoint maps the page URL to the API set object,
        # which contains the full track list.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports problems in an 'errors' list; surface each one.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Fetch the per-track stream definitions (128kbps mp3 only).
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2897
2898
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL, title and description from an
        InfoQ presentation page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real path is base64-encoded in jsclassref
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the final path component; split on
        # the LAST dot only, so filenames containing extra dots do not
        # raise a ValueError on unpacking.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2952
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested format; 'best' (or an
        unknown bitrate) selects the highest bitrate available.  Formats
        without bitrate sub-dicts map directly to their url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue

        return None

    def _print_formats(self, formats):
        """Print a 'format<TAB>bitrate<TAB>[ext]' line per available stream."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud API for a cloudcast and return its info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url; the regex groups are
        # already text, so no decode() is needed (the old str.decode()
        # calls crashed under Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes first for Python 3 compatibility)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try each format until one has a responsive url
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param if format_param is not None else u'NA'),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3067
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page (course + video
    params), a course page (expanded into all of its videos), and the
    site root (expanded into all courses, recursively via self.extract).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the matched URL groups and return a list of info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the media files.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Missing <title> or <videoFile> elements raise IndexError below.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title falls back to the course id when no <h1> is found.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked VideoPage is extracted recursively through
            # self.extract(), which re-enters this method's video branch.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Each CoursePage link recurses into the course branch above.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3179
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract song metadata from the page, then the media URL from
        the mediaGen XML; always picks the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns an already-decoded text string, so the
        # old str.decode('iso-8859-1') calls were wrong (Python 3 crash).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error for consistency with the rest of this extractor
            # (trouble() is the deprecated spelling)
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3259
3260
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served as numbered segments; one info dict is returned
    per segment ('<id>_partNN').
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a pseudo-random session id (timestamp + two random ints)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-dependent character permutation used to decode file ids.

        A multiplicative congruential generator driven by the server's
        seed picks (and removes) one character of the alphabet per step,
        yielding a permutation as a list of single characters.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated fileId: each field indexes the permutation."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and return one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested format onto a stream name + extension.
            # NOTE(review): any explicit format other than 'best'/'worst'
            # silently falls back to 'flv' — looks intentional but confirm.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3370
3371
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Download the video page and pull the flv url, title and
        thumbnail out of it with the class-level regexes."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the video page in one step.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # The flv url is percent-encoded inside the player parameters.
        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3434
3435
3436 class GooglePlusIE(InfoExtractor):
3437     """Information extractor for plus.google.com."""
3438
3439     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3440     IE_NAME = u'plus.google'
3441
3442     def __init__(self, downloader=None):
3443         InfoExtractor.__init__(self, downloader)
3444
3445     def report_extract_entry(self, url):
3446         """Report downloading extry"""
3447         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3448
3449     def report_date(self, upload_date):
3450         """Report downloading extry"""
3451         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3452
3453     def report_uploader(self, uploader):
3454         """Report downloading extry"""
3455         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3456
3457     def report_title(self, video_title):
3458         """Report downloading extry"""
3459         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3460
3461     def report_extract_vid_page(self, video_page):
3462         """Report information extraction."""
3463         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3464
3465     def _real_extract(self, url):
3466         # Extract id from URL
3467         mobj = re.match(self._VALID_URL, url)
3468         if mobj is None:
3469             self._downloader.report_error(u'Invalid URL: %s' % url)
3470             return
3471
3472         post_url = mobj.group(0)
3473         video_id = mobj.group(1)
3474
3475         video_extension = 'flv'
3476
3477         # Step 1, Retrieve post webpage to extract further information
3478         self.report_extract_entry(post_url)
3479         request = compat_urllib_request.Request(post_url)
3480         try:
3481             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3482         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3483             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3484             return
3485
3486         # Extract update date
3487         upload_date = None
3488         pattern = 'title="Timestamp">(.*?)</a>'
3489         mobj = re.search(pattern, webpage)
3490         if mobj:
3491             upload_date = mobj.group(1)
3492             # Convert timestring to a format suitable for filename
3493             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3494             upload_date = upload_date.strftime('%Y%m%d')
3495         self.report_date(upload_date)
3496
3497         # Extract uploader
3498         uploader = None
3499         pattern = r'rel\="author".*?>(.*?)</a>'
3500         mobj = re.search(pattern, webpage)
3501         if mobj:
3502             uploader = mobj.group(1)
3503         self.report_uploader(uploader)
3504
3505         # Extract title
3506         # Get the first line for title
3507         video_title = u'NA'
3508         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3509         mobj = re.search(pattern, webpage)
3510         if mobj:
3511             video_title = mobj.group(1)
3512         self.report_title(video_title)
3513
3514         # Step 2, Stimulate clicking the image box to launch video
3515         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3516         mobj = re.search(pattern, webpage)
3517         if mobj is None:
3518             self._downloader.report_error(u'unable to extract video page URL')
3519
3520         video_page = mobj.group(1)
3521         request = compat_urllib_request.Request(video_page)
3522         try:
3523             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3524         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3525             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3526             return
3527         self.report_extract_vid_page(video_page)
3528
3529
3530         # Extract video links on video page
3531         """Extract video links of all sizes"""
3532         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3533         mobj = re.findall(pattern, webpage)
3534         if len(mobj) == 0:
3535             self._downloader.report_error(u'unable to extract video links')
3536
3537         # Sort in resolution
3538         links = sorted(mobj)
3539
3540         # Choose the lowest of the sort, i.e. highest resolution
3541         video_url = links[-1]
3542         # Only get the url. The resolution part in the tuple has no use anymore
3543         video_url = video_url[-1]
3544         # Treat escaped \u0026 style hex
3545         try:
3546             video_url = video_url.decode("unicode_escape")
3547         except AttributeError: # Python 3
3548             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3549
3550
3551         return [{
3552             'id':       video_id,
3553             'url':      video_url,
3554             'uploader': uploader,
3555             'upload_date':  upload_date,
3556             'title':    video_title,
3557             'ext':      video_extension,
3558         }]
3559
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First (unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: key was misspelled 'uploader_date', so the extracted
            # date was silently ignored downstream.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3595
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one JSON API page and convert its clips to info dicts.

        Returns (number of API entries, list of info dicts). On download or
        API errors the problem is reported and (0, []) is returned so the
        caller's pagination loop terminates cleanly.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # Bug fix: a bare return (None) crashed the caller when it tried
            # to unpack the result into (page_count, page_info).
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # Bug fix: same as above — keep the (count, items) contract.
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        # lastindex == 1 means only the channel matched: paginate the archive.
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3682
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Bug fix: was missing, so m.group() below raised AttributeError
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Use report_error like the rest of the file instead of the
            # deprecated trouble(); also bail out instead of crashing below.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        # Description is optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3719
class SteamIE(InfoExtractor):
    """Information extractor for trailer videos on the Steam store."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return one info dict per trailer found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # Movie entries, titles and thumbnails appear in the same order on
        # the page, so they are matched up positionally with zip().
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                # Bug fix: previously fell through and appended an info dict
                # with an empty URL; skip the broken entry instead.
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return videos
3760
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The download URL is derived directly from the numeric video id.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        download_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        page = self._download_webpage(url, video_id)
        # Title and uploader id are embedded as data-* attributes.
        title = re.search(r'data-title="(?P<title>.+)"', page).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            page).group('uploader')
        return [{
            'id': video_id,
            'url': download_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3782
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        # Extension is inferred from the media URL itself.
        if mobj is not None:
            video_url = mobj.group()
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # Consistency fix: use report_error like every other extractor
            # instead of the deprecated trouble() call.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fall back to a timestamped placeholder title.
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3838
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        show_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, show_id)

        # Show metadata is embedded in the page as a JSON assignment.
        metadata_match = re.search(
            r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if metadata_match is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            show = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        stream_url = show['akamai_url'] + '&cbr=256'
        extension = compat_urllib_parse_urlparse(stream_url).path.rpartition('.')[2]
        host = show.get('host', {})
        image = show.get('image', {})
        return [{
            'id': show_id,
            'url': stream_url,
            'ext': extension,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': show.get('duration'),
        }]
3873
3874
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; warn and continue without it)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional; warn and continue without it)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The path's 4th component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: this previously tested the stale 'result' variable
            # (the download-list match, never None here), so an unavailable
            # requested format was returned as None instead of reported.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3992
3993
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The title is part of the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: the error message wrongly said 'video title'.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4035
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # NOTE: video_id is re-bound here to the embed page's numeric id,
        # which is what the rest of the extraction (and the info dict) uses.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL (the flash player receives it URL-encoded)
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
4081
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one info dict per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON literal.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # The play API requires a (random) session token in the URL.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Walk the mix track by track: each API response describes one song
        # and whether it is the last one.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            # Request the next track, passing the id of the one just fetched.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
4125
class KeekIE(InfoExtractor):
    """Information extractor for keek.com videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        # Video and thumbnail CDN URLs are derived directly from the id.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        page = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', page)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', page)
        uploader = clean_html(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
4149
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch to single-talk or playlist extraction based on which
        # alternative of _VALID_URL matched.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Talk entries and their titles appear in page order, so the two
        # iterators are matched up positionally with zip() below.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # id and mediaSlug come from the embedded talkDetails JavaScript blob.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4222
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract video metadata from the myspass XML metadata endpoint.

        Returns a one-element result list, or None after reporting an
        error when mandatory metadata fields are missing.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fall back to the filename extension when no format id is
            # present. (This branch previously referenced the undefined
            # name 'ext' and crashed with a NameError.)
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4278
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The flash XML descriptor lists the available file variants.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # Use the last variant listed in the descriptor.
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4311
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # use report_error for consistency with the other extractors
            # (trouble with a u'ERROR: ' prefix is the deprecated spelling)
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # previously fell through and crashed on m.group('title');
            # report the problem and bail out instead
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # description and uploader are optional; leave them as None if absent
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4360
4361
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Ordered from most to least specific: the generic extractor is last.
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]