8caace3af4fe55618537de001841bb6fbe2aa350
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) turns a URL into a list of dictionaries
    describing the video(s) behind it; the FileDownloader then processes
    those dictionaries (typically by downloading the media).

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Subclasses redefine _real_initialize() and _real_extract() and provide
    a _VALID_URL regular expression; they should normally also be added to
    the list of extractors.  _real_extract() must return a *list* of info
    dictionaries of the shape described above.  Set _WORKING = False on
    known-broken IEs so users are warned and their tests are skipped.
    """

    _ready = False        # becomes True once _real_initialize() has run
    _downloader = None    # FileDownloader instance, set via set_downloader()
    _WORKING = True       # flip to False for known-broken extractors

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching *downloader*."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True if this IE can handle *url*."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Return whether this extractor is believed to work."""
        return cls._WORKING

    def initialize(self):
        """Run _real_initialize() exactly once (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if necessary, then return the info dicts for *url*."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach *downloader* as this IE's FileDownloader."""
        self._downloader = downloader

    def _real_initialize(self):
        """Subclass hook: perform the real initialization work."""
        pass

    def _real_extract(self, url):
        """Subclass hook: perform the real extraction work."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by dropping the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        message = u'Downloading video webpage' if note is None else note
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, message))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset from the Content-Type header; default to UTF-8.
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
                                 handle.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        return handle.read().decode(encoding, 'replace')
137
138
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles single-video URLs: watch pages, embed/e/v paths, youtu.be
    short links, and bare video IDs.  Playlist URLs are rejected in
    suitable() so that YoutubePlaylistIE can claim them.
    """

    # Matched with re.VERBOSE; group 1 is the URL prefix (if any) and
    # group 2 is the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Pulls the original target out of redirect URLs (e.g. age verification).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; unknown itags fall back to 'flv' below.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimension string (appears to be HEIGHTxWIDTH, e.g. 1080x1920);
    # used only for display via _print_formats and the 'format' field.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; defer them to
        # YoutubePlaylistIE (defined elsewhere in this file).
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitle tracks."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name, or a
        (warning_message, None) tuple if the listing could not be fetched
        or the video has no subtitles (callers test isinstance(..., tuple))."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'WARNING: video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        where error_message is None on success.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'WARNING: Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): # a tuple means _get_available_subtitles failed
            return [(sub_lang_list[0], None, None)]
        # Pick the language: user preference, then English, then first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; return a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print itag, extension and dimensions for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in (params or .netrc), confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the GALX and dsh hidden-form tokens from the login page;
        # both are posted back in the sign-in form below.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the sign-in form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID (group 2 of _VALID_URL) from *url*,
        or report an error and return None when the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract the info dict(s) for a single YouTube video URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The URL is JS-escaped in the page; strip the backslashes.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        # Try several 'el' values in turn; stop at the first response that
        # carries a 'token' parameter.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then try the known date formats.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): strptime raising on a non-matching format
                    # is the expected case here, but this bare except also
                    # hides any other failure.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        # NOTE(review): video_token is extracted but not referenced again in
        # this method.
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
638
639
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Visit the disclaimer page and POST the age-confirmation form so
        family-filtered videos become accessible for this session."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element result list for the given metacafe URL,
        or delegate yt-prefixed ids back to the YouTube extractor."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: urlopen().read() returns bytes on Python 3
            # and the str regexes below would otherwise raise TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob when no direct mediaURL is present.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # webpage is already str, so the old .decode('utf-8') calls (which
        # would raise AttributeError on Python 3 str) are dropped.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
765
766
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element result list for the given Dailymotion URL."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the path segment up to the first '_' or '?'.
        video_id = url_match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Probe qualities from best to worst and keep the first available key.
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        media_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if media_match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(media_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # Prefer the owner span; fall back to the official-user author span.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            # lookin for official user
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; reorder to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
854
855
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element result list for the given photobucket URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: urlopen().read() returns bytes on Python 3
            # and the str regexes below would otherwise raise TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # The match groups are already str; the old .decode('utf-8') calls
        # would raise AttributeError on Python 3.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
919
920
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element result list for the given Yahoo! Video URL.

        Non-'/watch/' URLs are first rewritten to their canonical /watch/
        form and re-extracted (new_video=False marks the second pass).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode immediately: urlopen().read() returns bytes on
                # Python 3 and the str regexes below need str input.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # group(1) is the 'people'/'profile' path segment; the uploader
        # name is captured by group(2).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1062
1063
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element result list for the given Vimeo URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the page layout changed and the marker is missing;
            # ValueError: the extracted snippet is not valid JSON.  The old
            # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket, preferring hd, then sd.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1182
1183
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles two page flavours: '+7' catch-up pages (extractPlus7Stream)
    and live-stream index pages (extractLiveStream).
    """

    # French/German video pages on videos.arte.tv.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in e.g. 'index-12345.html'.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the raw response body, or None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urlopen raises ValueError for malformed URLs.
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map the numbered groups listed in
        matchTuples (index, key, error-message) into a dict.

        Returns None when the page does not match or a listed group is
        empty.  NOTE(review): if fetch_webpage failed, `page` is None and
        re.search raises TypeError here -- confirm callers tolerate that.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): `trouble` is a legacy error hook; the rest of
                # this class uses report_error -- confirm both still exist.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP data for a live page.

        NOTE(review): the computed video_url is assembled but never
        returned, so live streams produce no downloadable result here.
        """
        video_lang = url.split('/')[-4]
        # First hop: find the videothek JS that carries the stream config.
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Second hop: pull the language-keyed path, SWF player and rtmp URL.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of '+7' metadata documents and return an info dict."""
        video_lang = url.split('/')[-3]
        # First hop: the flash player params reference a videoref file URL.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Second hop: pick the language-specific <video> reference.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: the per-video XML with id, title, date and the HD URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode here assumes the title is bytes; on
            # Python 3 a str title would raise AttributeError -- confirm.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live or '+7' extractor based on the URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages return early; extractLiveStream yields no result (see
        # its docstring), so no info dict is produced for them.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1318
1319
1320 class GenericIE(InfoExtractor):
1321     """Generic last-resort information extractor."""
1322
1323     _VALID_URL = r'.*'
1324     IE_NAME = u'generic'
1325
    def __init__(self, downloader=None):
        # Plain delegation to the shared InfoExtractor initialiser.
        InfoExtractor.__init__(self, downloader)
1328
1329     def report_download_webpage(self, video_id):
1330         """Report webpage download."""
1331         if not self._downloader.params.get('test', False):
1332             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1333         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1334
1335     def report_extraction(self, video_id):
1336         """Report information extraction."""
1337         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1338
1339     def report_following_redirect(self, new_url):
1340         """Report information extraction."""
1341         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1342
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request; when the final URL differs from `url`, the
        download is restarted on the resolved URL and True is returned.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Strip body-related headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Non-redirect codes are surfaced as HTTP errors.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the 405 response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        # Restart the extraction chain on the resolved URL.
        self._downloader.download([new_url])
        return True
1397
1398     def _real_extract(self, url):
1399         if self._test_redirect(url): return
1400
1401         video_id = url.split('/')[-1]
1402         try:
1403             webpage = self._download_webpage(url, video_id)
1404         except ValueError as err:
1405             # since this is the last-resort InfoExtractor, if
1406             # this error is thrown, it'll be thrown here
1407             self._downloader.report_error(u'Invalid URL: %s' % url)
1408             return
1409
1410         self.report_extraction(video_id)
1411         # Start with something easy: JW Player in SWFObject
1412         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1413         if mobj is None:
1414             # Broaden the search a little bit
1415             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1416         if mobj is None:
1417             # Broaden the search a little bit: JWPlayer JS loader
1418             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1419         if mobj is None:
1420             self._downloader.report_error(u'Invalid URL: %s' % url)
1421             return
1422
1423         # It's possible that one of the regexes
1424         # matched, but returned an empty group:
1425         if mobj.group(1) is None:
1426             self._downloader.report_error(u'Invalid URL: %s' % url)
1427             return
1428
1429         video_url = compat_urllib_parse.unquote(mobj.group(1))
1430         video_id = os.path.basename(video_url)
1431
1432         # here's a fun little line of code for you:
1433         video_extension = os.path.splitext(video_id)[1][1:]
1434         video_id = os.path.splitext(video_id)[0]
1435
1436         # it's tempting to parse this further, but you would
1437         # have to take into account all the variations like
1438         #   Video Title - Site Name
1439         #   Site Name | Video Title
1440         #   Video Title - Tagline | Site Name
1441         # and so on and so forth; it's just not practical
1442         mobj = re.search(r'<title>(.*)</title>', webpage)
1443         if mobj is None:
1444             self._downloader.report_error(u'unable to extract title')
1445             return
1446         video_title = mobj.group(1)
1447
1448         # video uploader is domain name
1449         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1450         if mobj is None:
1451             self._downloader.report_error(u'unable to extract title')
1452             return
1453         video_uploader = mobj.group(1)
1454
1455         return [{
1456             'id':       video_id,
1457             'url':      video_url,
1458             'uploader': video_uploader,
1459             'upload_date':  None,
1460             'title':    video_title,
1461             'ext':      video_extension,
1462         }]
1463
1464
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ytsearch:, ytsearchN: and ytsearchall: pseudo-URLs by
    querying the gdata JSON-C API and queueing each matching watch URL
    on the downloader.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            # Bare "ytsearch:" downloads only the first result.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                # Fixed: use report_error like every other extractor in this
                # file instead of the deprecated trouble() interface.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request past the total the API reports.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1543
1544
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles gvsearch:, gvsearchN: and gvsearchall: pseudo-URLs by
    scraping the Google Video result pages and queueing each found
    videoplay URL on the downloader.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            # Bare "gvsearch:" downloads only the first result.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_videos(self, video_ids):
        """Queue the videoplay URL of every collected id on the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._download_videos(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No next-page link: download whatever was collected.
                self._download_videos(video_ids)
                return

            pagenum = pagenum + 1
1625
1626
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Announce the download of one search-result page."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and trigger the result downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            # Bare "yvsearch:" downloads a single result.
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Fetch result pages until n distinct videos have been queued."""

        video_ids = []
        already_seen = set()
        pagenum = 1

        def queue_collected():
            # Hand every collected watch URL to the downloader.
            for video_id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen identifiers from this page.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in already_seen:
                    continue
                already_seen.add(video_id)
                video_ids.append(video_id)
                if len(video_ids) == n:
                    queue_collected()
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further pages: queue what was found.
                queue_collected()
                return

            pagenum = pagenum + 1
1711
1712
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Announce the download of one playlist API page."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch a playlist via the gdata API and queue its videos in order."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # The id may come from either alternative of the pattern.
        playlist_id = mobj.group(1) or mobj.group(2)
        videos = []
        page_num = 1

        while True:
            self.report_download_page(playlist_id, page_num)

            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response or 'entry' not in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return

            # Keep (position, url) pairs so results can be ordered later.
            entries = response['feed']['entry']
            for entry in entries:
                if 'content' in entry:
                    videos.append((entry['yt$position']['$t'], entry['content']['src']))

            # A page shorter than the window size is the last one.
            if len(entries) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v_url for (_position, v_url) in sorted(videos)]
        total = len(videos)

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1805
1806
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Announce the download of one channel listing page."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a channel and queue its watch URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's ids, skipping duplicates within the page.
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = mobj.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            # Stop once the "Next »" marker disappears from the page.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1857
1858
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Announce the download of one gdata window of video ids."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect a user's uploads via the gdata API and queue them."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The gdata API caps each response (currently at 50 entries), so
        # request consecutive index windows until a short page signals
        # that the upload list is exhausted.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect this window's ids in first-seen order.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            # A window shorter than the page size must be the last one.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1940
1941
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Scrapes the user page for the numeric user id, then pages through the
    mobile episode-list endpoint until a short page marks the end, and
    queues every found video URL on the downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Fixed: previously this dereferenced a failed match inside the
            # network try-block, crashing with an unhandled AttributeError.
            self._downloader.report_error(u'unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fixed: use compat_str like the rest of the file so non-ASCII
                # error messages do not break on Python 2.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Fixed: the duplicate check used to compare the raw match
                # while storing the unescaped form, so escaped duplicates
                # slipped through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page shorter than _PAGE_SIZE entries must be the last one.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2031
2032
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Resolves a depositfiles link to the real fileshare URL and the file
    title by simulating a press of the 'Free download' button.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Fixed: raw string for the regex, so '\s' is not an invalid
                # string escape (DeprecationWarning on newer Pythons).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): webpage is read as bytes and the decode() calls below
        # assume Python 2 str semantics — confirm before running on Python 3.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2091
2092
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook, if credentials were supplied via options or .netrc."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded between these two JS fragments.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream; fall back to SD. Use .get() so a missing key
        # falls through instead of raising KeyError.
        video_url = params.get('hd_src')
        if not video_url:
            video_url = params.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2188
2189
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL served the media file directly."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Handles three cases: /play/ player URLs (resolved via redirect and
        re-extracted), URLs that serve the media directly, and regular pages
        queried through the site's JSON API.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page whose URL fragment holds the
        # real file reference; rebuild a canonical URL and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the same URL for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The response is the media itself; derive title/ext from the
                # URL and hand the open handle to the downloader.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened successfully above (otherwise we raised).
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert the site's datestamp to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2290
2291
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was self._download.report_error, which raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link reveals the media server path; the FLV lives
        # next to the thumbnails directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2340
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE: written in verbose-mode regex syntax; every match against it must
    # pass re.VERBOSE.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrate identifiers the site is known to offer.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension, used by --list-formats output.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame dimensions, informational only.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Must use re.VERBOSE because _VALID_URL is a verbose-mode pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Print one line per known bitrate: "<bitrate> : <ext> [<WxH>]".
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all video parts of an episode or clip.

        Resolves shortname/latest-episode URLs, downloads the show's MRSS
        index, then fetches a media configuration per part and picks the
        requested (or highest) bitrate rendition.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortnames like ":tds" mean "latest full episode of that show".
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Determine the episode/clip title; dlNewest means we must follow the
        # site's redirect to discover the newest episode.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The "full-episodes/" URL redirects to the newest episode; take
            # the title from the redirected URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mtvnservices media URI embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in an attribute
            # without a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One <item> per video part; each part needs its own config download.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL into an equivalent plain-HTTP one.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2535
2536
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            # Decode using the charset announced in the Content-Type header,
            # defaulting to UTF-8 when none is given.
            page_handle = compat_urllib_request.urlopen(url)
            raw_page = page_handle.read()
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', page_handle.headers['Content-Type'])
            page = raw_page.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in the page's <meta> tags.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL carries the (percent-encoded) config URL.
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2610
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # Step 1: fetch the metadata XML describing the video.
        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Step 2: fetch the F4M manifest referenced by the metadata.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Step 3: assemble the fragment URL from the manifest's components.
        parsed = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = parsed.scheme + '://' + parsed.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2681
2682
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The direct FLV URL is embedded, percent-encoded, in the page.
        url_m = re.search(r'flv_url=(.+?)&', webpage)
        if url_m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_m.group(1))

        # Title comes from the page <title>, minus the site suffix.
        title_m = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_m.group(1)

        # Thumbnail: take the whole match (group 0), not just the filename.
        thumb_m = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_m is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': thumb_m.group(0),
            'description': None,
        }]
2740
2741
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report track URL resolution."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report stream retrieval."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track page URL via the SoundCloud API and return the
        information for its 128 kbit/s MP3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract slug of the song title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the canonical track URL into a JSON track description.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the per-track stream descriptions and pick the MP3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2814
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets (playlists).

    The set page URL is resolved through the public Soundcloud API to get
    the track list; each track's stream endpoint is then queried for a
    direct MP3 download URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): same IE_NAME as SoundcloudIE — presumably intended to be
    # distinct (e.g. 'soundcloud:set'); confirm before changing.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # use report_error, matching SoundcloudIE (trouble() is deprecated)
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The resolve API reports failures in an 'errors' list.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2895
2896
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The real stream id is base64-encoded in the page's 'jsclassref'
    attribute; it is decoded and appended to the RTMP base URL.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL (base64-encoded, percent-escaped stream id)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; placeholder when missing)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only: the decoded stream id may itself
        # contain dots, which made a plain split('.') unpack raise.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2950
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested format and bitrate; when the
        format entry carries no per-bitrate mapping (a TypeError on lookup),
        the entry itself is the url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns the first URL in url_list that responds, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (re.match on a text URL already yields text; the previous
        # .decode('utf-8') calls crashed on Python 3, where str has no decode)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes first for Python 3 compatibility)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe every advertised format until one URL answers
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3065
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page, a course page
    (expanded into references to its video pages), and the site root
    (expanded into references to all course pages). Course and root
    results are re-fed through self.extract, so extraction is recursive.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL shape: video page, course page, or root."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and relative video path come from the per-video XML.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            # Extension is whatever follows the last '.' in the video URL.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title from <h1>; fall back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect unique VideoPage links and recurse into each.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect unique CoursePage links and recurse into each.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3177
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Reads the song/performer meta tags from the video page, then fetches
    the mediaGen XML to pick a rendition (currently always the highest
    quality one).
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns decoded text, so the former
        # .decode('iso-8859-1') calls were wrong (and crash on Python 3,
        # where str has no decode method).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error, matching the rest of the file (trouble() is deprecated)
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3257
3258
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Fetches the playlist JSON for a video id, decodes the obfuscated
    file id with a server-provided seed, and emits one entry per segment.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Announce the webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Announce the information extraction step."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two random ints."""
        millis = int(time.time() * 1000)
        part_a = random.randint(1000, 1998)
        part_b = random.randint(1000, 9999)
        return "%d%d%d" % (millis, part_a, part_b)

    def _get_file_ID_mix_string(self, seed):
        """Derive the character permutation used to decode file ids.

        A linear-congruential generator seeded with the server-provided
        value repeatedly picks (and removes) one character from the
        alphabet, producing a shuffled character list.
        """
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        for _ in range(len(pool)):
            seed = (seed * 211 + 30031) % 65536
            pick = int(math.floor(seed / 65536 * len(pool)))
            shuffled.append(pool.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            entry = config['data'][0]
            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # Map the requested quality onto Youku's format names.
            if requested is None or requested == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [s['k'] for s in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the decoded file id encode the segment number,
        # so they are replaced per segment below.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3368
3369
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # wrap the exception with compat_str like the other extractors do
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        # The flv URL is percent-encoded in the page source.
        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3432
3433
3434 class GooglePlusIE(InfoExtractor):
3435     """Information extractor for plus.google.com."""
3436
3437     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3438     IE_NAME = u'plus.google'
3439
    def __init__(self, downloader=None):
        # No extractor-specific state; defer entirely to the base class.
        InfoExtractor.__init__(self, downloader)
3442
    def report_extract_entry(self, url):
        """Report that the post entry page is being downloaded."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3446
    def report_date(self, upload_date):
        """Report the extracted upload date of the entry."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3450
    def report_uploader(self, uploader):
        """Report the extracted uploader of the entry."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3454
    def report_title(self, video_title):
        """Report the extracted title of the entry."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3458
    def report_extract_vid_page(self, video_page):
        """Report that the video page is being extracted."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3462
3463     def _real_extract(self, url):
3464         # Extract id from URL
3465         mobj = re.match(self._VALID_URL, url)
3466         if mobj is None:
3467             self._downloader.report_error(u'Invalid URL: %s' % url)
3468             return
3469
3470         post_url = mobj.group(0)
3471         video_id = mobj.group(1)
3472
3473         video_extension = 'flv'
3474
3475         # Step 1, Retrieve post webpage to extract further information
3476         self.report_extract_entry(post_url)
3477         request = compat_urllib_request.Request(post_url)
3478         try:
3479             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3480         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3481             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3482             return
3483
3484         # Extract update date
3485         upload_date = None
3486         pattern = 'title="Timestamp">(.*?)</a>'
3487         mobj = re.search(pattern, webpage)
3488         if mobj:
3489             upload_date = mobj.group(1)
3490             # Convert timestring to a format suitable for filename
3491             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3492             upload_date = upload_date.strftime('%Y%m%d')
3493         self.report_date(upload_date)
3494
3495         # Extract uploader
3496         uploader = None
3497         pattern = r'rel\="author".*?>(.*?)</a>'
3498         mobj = re.search(pattern, webpage)
3499         if mobj:
3500             uploader = mobj.group(1)
3501         self.report_uploader(uploader)
3502
3503         # Extract title
3504         # Get the first line for title
3505         video_title = u'NA'
3506         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3507         mobj = re.search(pattern, webpage)
3508         if mobj:
3509             video_title = mobj.group(1)
3510         self.report_title(video_title)
3511
3512         # Step 2, Stimulate clicking the image box to launch video
3513         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3514         mobj = re.search(pattern, webpage)
3515         if mobj is None:
3516             self._downloader.report_error(u'unable to extract video page URL')
3517
3518         video_page = mobj.group(1)
3519         request = compat_urllib_request.Request(video_page)
3520         try:
3521             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3522         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3523             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3524             return
3525         self.report_extract_vid_page(video_page)
3526
3527
3528         # Extract video links on video page
3529         """Extract video links of all sizes"""
3530         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3531         mobj = re.findall(pattern, webpage)
3532         if len(mobj) == 0:
3533             self._downloader.report_error(u'unable to extract video links')
3534
3535         # Sort in resolution
3536         links = sorted(mobj)
3537
3538         # Choose the lowest of the sort, i.e. highest resolution
3539         video_url = links[-1]
3540         # Only get the url. The resolution part in the tuple has no use anymore
3541         video_url = video_url[-1]
3542         # Treat escaped \u0026 style hex
3543         try:
3544             video_url = video_url.decode("unicode_escape")
3545         except AttributeError: # Python 3
3546             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3547
3548
3549         return [{
3550             'id':       video_id,
3551             'url':      video_url,
3552             'uploader': uploader,
3553             'upload_date':  upload_date,
3554             'title':    video_title,
3555             'ext':      video_extension,
3556         }]
3557
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived from the path segment of the page URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: the key was misspelled 'uploader_date', which nothing
            # consumes; the documented optional field is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3593
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page.

        Returns a tuple (count of items in the response, list of *valid*
        video info dicts). On error it reports the problem and returns
        (0, []) so the caller's tuple unpacking stays safe.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # Bug fix: returning None here made the caller's
            # `page_count, page_info = self._parse_page(...)` raise TypeError.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # On errors the API answers with a JSON object, not a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with 'YYYY-MM-DD'; keep just YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL (no /b/ part): page through the whole archive.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3680
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Bug fix: previously fell through and crashed on m.group below.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Consistency fix: replaced the deprecated trouble() call with
            # report_error() plus an early return, like the other extractors.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        # Description is optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3717
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    # Bug fix: the dots in the host name were unescaped and matched any
    # character; the pattern is verbose (see suitable()).
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so it must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always extract from the game's video listing page.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # URLs, titles and thumbnails appear in the same order on the page.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Bug fix: skip the broken entry instead of appending an
                # info dict with an empty URL.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return videos
3758
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The download URL can be derived directly from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader_match = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = uploader_match.group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3780
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Pick the extension from the URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # Consistency fix: replaced the deprecated trouble() call (with a
            # hand-written 'ERROR:' prefix) by report_error(), as the other
            # extractors do.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # NOTE(review): 'World Start' looks like a typo for 'World Star',
            # left unchanged to preserve the emitted fallback title.
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3836
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON blob in an inline script.
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3871
3872
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site requires an age-verification cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component is e.g. '480p_370k_8004515':
            # resolution, then bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: previously tested the stale 'result' variable (left over
            # from the regex searches above) instead of the format lookup,
            # so a missing format slipped through as [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3989
3990
3991
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: the message wrongly said 'video title' here although
            # this branch reports a failed upload-date extraction.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4033
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            # Consistency fix: dropped the redundant 'ERROR: ' prefix; sibling
            # extractors raise ExtractorError with the bare message.
            raise ExtractorError(u'unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the player setup call
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
4079
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON assignment.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # Any random session id is accepted by the play/next API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4123
class KeekIE(InfoExtractor):
    """Information extractor for keek.com videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Video and thumbnail URLs can be derived directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4147
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose style, so it must be matched
        # with re.VERBOSE here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # Single talk: one info dict wrapped in a list.
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            # NOTE(review): IE_NAME is presumably inherited from the base
            # class; it is not defined on TEDIE itself.
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Return the direct .mp4 download URL for the given media slug.'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Return the info dicts of all talks in the playlist page at url.'''
        # NOTE(review): `([.\s]*?)` matches only literal dots and whitespace;
        # it looks intended to skip arbitrary filler between attributes —
        # confirm against the current page markup.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        # Ids and talk links are assumed to appear in the same order on the page.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the info dict for the single talk at url."""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # Pull the numeric id and media slug out of the inline talkDetails JS.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        # The page's own id overrides any id the caller passed in.
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4220
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de, driven by the site's XML metadata endpoint."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element info list for the video behind *url*.

        The video id is taken from the URL path; all other fields come from
        the getvideometadataxml.php metadata document.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory,
        # format_id/description/imagePreview are optional
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fall back to the file extension. BUG FIX: the original read the
            # undefined name `ext` here, raising NameError whenever the
            # metadata had no format_id; the variable is `extension`.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4276
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de video pages."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        # The numeric video id is embedded in the page URL itself.
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Format details live in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last <type> entry (the original code did the same).
        best_type = idoc[-1]
        filename = best_type.findall('./filename')[0].text
        duration = float(best_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4309
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element info list for the LiveLeak video at *url*.

        Title is mandatory; description and uploader are best-effort and may
        be None when the page does not expose them.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Use report_error, consistent with the other extractors, instead
            # of the deprecated trouble() call.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: the original reported the failure but fell through and
            # then called m.group() on None, crashing with AttributeError.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4358
4359
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this ordered: more specific extractors must precede the ones
    # whose URL patterns would also match (GenericIE is the catch-all).
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        GenericIE,
    ]
    return [ie_class() for ie_class in extractor_classes]