Refactor subtitle options from 'srt' to the more generic 'sub'.
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor turns a URL into a *list* of dictionaries
    describing the video(s) behind it; the FileDownloader then acts on
    that information, typically by downloading the video.

    Mandatory fields in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and re-implement the
    _real_initialize() and _real_extract() methods; they should usually
    also be added to the list of extractors.  Broken extractors should
    set _WORKING to False so users are warned and the tests are skipped.
    """

    # Becomes True once _real_initialize() has run.
    _ready = False
    # The FileDownloader this extractor reports to (may be None).
    _downloader = None
    # Subclasses flip this to False when the extractor is known broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching *downloader*."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when *url* is handled by this extractor."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Return whether this extractor is known to work."""
        return self._WORKING

    def initialize(self):
        """Run _real_initialize() exactly once (authentication, etc.)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the info dicts for *url*."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach *downloader* as the FileDownloader for this extractor."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the name from the class name, dropping the "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        raw = self._request_webpage(url_or_request, video_id, note, errnote).read()
        return raw.decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" (as historically printed by --list-formats)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with whitespace/comments, so VERBOSE is required.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, lang):
        """Report attempt to download video subtitles for a language."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for lang: %s' % (video_id,lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language codes to track names.

        On failure returns a (warning message, None) tuple instead of a
        dict; callers must check for that with isinstance().
        """
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'WARNING: video has no closed captions', None)
        return sub_lang_list

    def _request_subtitle(self, sub_lang, sub_name, video_id, format = 'srt'):
        """Fetch one subtitle track; return an (error, lang, content) tuple.

        On success the error slot is None; on failure lang and content
        are None.  The arity is always 3 so callers can unpack safely.
        """
        self.report_video_subtitles_request(video_id, sub_lang)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # 3-tuple, not 2-tuple: _real_extract unpacks (error, lang, content).
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'WARNING: Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """Return a one-element list holding an (error, lang, content) tuple.

        The language comes from the 'subtitleslang' option, falling back
        to English and then to the first available track.
        """
        self.report_video_subtitles_download(video_id)
        sub_lang_list = self._get_available_subtitles(video_id)
        if isinstance(sub_lang_list, tuple):
            # _get_available_subtitles failed; propagate its warning in the
            # (error, lang, content) shape instead of crashing on .keys().
            return [(sub_lang_list[0], None, None)]

        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            # Wrapped in a list of one 3-tuple so callers can index and unpack.
            return [(u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error, lang, content) tuples, one per track."""
        self.report_video_subtitles_download(video_id)
        sub_lang_list = self._get_available_subtitles(video_id)
        if isinstance(sub_lang_list, tuple):
            # Listing the tracks failed; surface the warning once.
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print the itag, extension and dimensions of each format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, when credentials are available,
        log in and confirm age on youtube.com."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # Anti-forgery tokens scraped from the login page.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        # urlencode() returns str; POST data must be bytes on Python 3,
        # mirroring the login form submission above.
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video id embedded in *url* (group 2 of
        _VALID_URL), or None after reporting an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download the watch page and get_video_info, then build one info
        dict per requested format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The URL is JSON-escaped in the page; strip the backslashes.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values, some of which bypass
        # embedding/age restrictions, until one response carries a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # This expression does not match; try the next one.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.trouble(sub_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig' field;
            # a missing one would raise KeyError here — confirm against current
            # get_video_info responses before hardening.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
605
606
607 class MetacafeIE(InfoExtractor):
608     """Information Extractor for metacafe.com."""
609
610     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
611     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
612     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
613     IE_NAME = u'metacafe'
614
615     def __init__(self, downloader=None):
616         InfoExtractor.__init__(self, downloader)
617
618     def report_disclaimer(self):
619         """Report disclaimer retrieval."""
620         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
621
622     def report_age_confirmation(self):
623         """Report attempt to confirm age."""
624         self._downloader.to_screen(u'[metacafe] Confirming age')
625
626     def report_download_webpage(self, video_id):
627         """Report webpage download."""
628         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
629
630     def report_extraction(self, video_id):
631         """Report information extraction."""
632         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
633
634     def _real_initialize(self):
635         # Retrieve disclaimer
636         request = compat_urllib_request.Request(self._DISCLAIMER)
637         try:
638             self.report_disclaimer()
639             disclaimer = compat_urllib_request.urlopen(request).read()
640         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
641             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
642             return
643
644         # Confirm age
645         disclaimer_form = {
646             'filters': '0',
647             'submit': "Continue - I'm over 18",
648             }
649         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
650         try:
651             self.report_age_confirmation()
652             disclaimer = compat_urllib_request.urlopen(request).read()
653         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
654             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
655             return
656
657     def _real_extract(self, url):
658         # Extract id and simplified title from URL
659         mobj = re.match(self._VALID_URL, url)
660         if mobj is None:
661             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
662             return
663
664         video_id = mobj.group(1)
665
666         # Check if video comes from YouTube
667         mobj2 = re.match(r'^yt-(.*)$', video_id)
668         if mobj2 is not None:
669             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
670             return
671
672         # Retrieve video webpage to extract further information
673         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
674         try:
675             self.report_download_webpage(video_id)
676             webpage = compat_urllib_request.urlopen(request).read()
677         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
678             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
679             return
680
681         # Extract URL, uploader and title from webpage
682         self.report_extraction(video_id)
683         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
684         if mobj is not None:
685             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
686             video_extension = mediaURL[-3:]
687
688             # Extract gdaKey if available
689             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
690             if mobj is None:
691                 video_url = mediaURL
692             else:
693                 gdaKey = mobj.group(1)
694                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
695         else:
696             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
697             if mobj is None:
698                 self._downloader.trouble(u'ERROR: unable to extract media URL')
699                 return
700             vardict = compat_parse_qs(mobj.group(1))
701             if 'mediaData' not in vardict:
702                 self._downloader.trouble(u'ERROR: unable to extract media URL')
703                 return
704             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
705             if mobj is None:
706                 self._downloader.trouble(u'ERROR: unable to extract media URL')
707                 return
708             mediaURL = mobj.group(1).replace('\\/', '/')
709             video_extension = mediaURL[-3:]
710             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
711
712         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
713         if mobj is None:
714             self._downloader.trouble(u'ERROR: unable to extract title')
715             return
716         video_title = mobj.group(1).decode('utf-8')
717
718         mobj = re.search(r'submitter=(.*?);', webpage)
719         if mobj is None:
720             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
721             return
722         video_uploader = mobj.group(1)
723
724         return [{
725             'id':       video_id.decode('utf-8'),
726             'url':      video_url.decode('utf-8'),
727             'uploader': video_uploader.decode('utf-8'),
728             'upload_date':  None,
729             'title':    video_title,
730             'ext':      video_extension.decode('utf-8'),
731         }]
732
733
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Matches e.g. http://www.dailymotion.com/video/<id>_<title-slug>
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # Marked as broken: skipped by the automated test suite.
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the info dictionary for a Dailymotion video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the "_title" slug and any query string from the captured id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos stay reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, in decreasing order of preference.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            # for/else: reached only when no break happened, i.e. no key matched.
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # URL is percent-encoded and uses JSON-escaped slashes ("\/").
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for an official user instead
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                # Non-fatal: the uploader stays None.
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        # Page shows DD-MM-YYYY; convert to the YYYYMMDD convention.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
821
822
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Matches URLs whose query string carries "current=<something>.flv".
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the info dictionary for a Photobucket flv URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Only flv files are matched by _VALID_URL, so the extension is fixed.
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader are both parsed out of the <title> element.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): the .decode() calls below assume bytestrings
        # (Python 2 semantics) -- confirm before running under Python 3.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
886
887
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # Marked as broken: skipped by the automated test suite.
    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the info dictionary for a Yahoo! Video URL.

        Non-'/watch/' URLs are first rewritten to their canonical
        English-language /watch/ form and re-dispatched once with
        new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) is the URL path segment ("people" or "profile");
        # the uploader's display name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; both are required for the
        # playlist request below.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1029
1030
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the info dictionary for a Vimeo video URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize HLS redirect links to the canonical video page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: was a bare "except:", which also swallows KeyboardInterrupt
        # and SystemExit.  Only the split (IndexError) and json.loads
        # (ValueError) can legitimately fail here.
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality bucket that has at least one candidate.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1149
1150
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # URLs ending in "index-<n>.html" are treated as live streams.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and apply *regex*; return a dict of named groups.

        matchTuples is a list of (group_index, key, error_message);
        each matched group is stored under its key.  Returns None (after
        reporting) when the regex or any group fails to match.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None and re.search
        # will raise TypeError rather than report cleanly -- worth confirming.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp URL for a live stream page.

        NOTE(review): the final video_url is computed but never returned or
        stored, so live-stream extraction currently produces no result.
        """
        # Language code is the 4th-from-last path segment (fr/de).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Arte+7 redirection chain and return the info dict."""
        # Language code is the 3rd-from-last path segment (fr/de).
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between live-stream and Arte+7 extraction."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns nothing, so live URLs
            # currently yield no downloadable result.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1285
1286
1287 class GenericIE(InfoExtractor):
1288     """Generic last-resort information extractor."""
1289
1290     _VALID_URL = r'.*'
1291     IE_NAME = u'generic'
1292
1293     def __init__(self, downloader=None):
1294         InfoExtractor.__init__(self, downloader)
1295
1296     def report_download_webpage(self, video_id):
1297         """Report webpage download."""
1298         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1299         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1300
1301     def report_extraction(self, video_id):
1302         """Report information extraction."""
1303         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1304
1305     def report_following_redirect(self, new_url):
1306         """Report information extraction."""
1307         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1308
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request (falling back to GET on 405) and, if the
        final URL differs from *url*, hands the new URL back to the
        downloader and returns True; otherwise returns False.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force the HEAD method so only headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following redirects: nothing to do.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        # Restart the extraction chain with the resolved URL.
        self._downloader.download([new_url])
        return True
1363
1364     def _real_extract(self, url):
1365         if self._test_redirect(url): return
1366
1367         video_id = url.split('/')[-1]
1368         request = compat_urllib_request.Request(url)
1369         try:
1370             self.report_download_webpage(video_id)
1371             webpage = compat_urllib_request.urlopen(request).read()
1372         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1373             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1374             return
1375         except ValueError as err:
1376             # since this is the last-resort InfoExtractor, if
1377             # this error is thrown, it'll be thrown here
1378             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1379             return
1380
1381         self.report_extraction(video_id)
1382         # Start with something easy: JW Player in SWFObject
1383         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1384         if mobj is None:
1385             # Broaden the search a little bit
1386             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1387         if mobj is None:
1388             # Broaden the search a little bit: JWPlayer JS loader
1389             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1390         if mobj is None:
1391             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1392             return
1393
1394         # It's possible that one of the regexes
1395         # matched, but returned an empty group:
1396         if mobj.group(1) is None:
1397             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1398             return
1399
1400         video_url = compat_urllib_parse.unquote(mobj.group(1))
1401         video_id = os.path.basename(video_url)
1402
1403         # here's a fun little line of code for you:
1404         video_extension = os.path.splitext(video_id)[1][1:]
1405         video_id = os.path.splitext(video_id)[0]
1406
1407         # it's tempting to parse this further, but you would
1408         # have to take into account all the variations like
1409         #   Video Title - Site Name
1410         #   Site Name | Video Title
1411         #   Video Title - Tagline | Site Name
1412         # and so on and so forth; it's just not practical
1413         mobj = re.search(r'<title>(.*)</title>', webpage)
1414         if mobj is None:
1415             self._downloader.trouble(u'ERROR: unable to extract title')
1416             return
1417         video_title = mobj.group(1)
1418
1419         # video uploader is domain name
1420         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1421         if mobj is None:
1422             self._downloader.trouble(u'ERROR: unable to extract title')
1423             return
1424         video_uploader = mobj.group(1)
1425
1426         return [{
1427             'id':       video_id,
1428             'url':      video_url,
1429             'uploader': video_uploader,
1430             'upload_date':  None,
1431             'title':    video_title,
1432             'ext':      video_extension,
1433         }]
1434
1435
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearch[N|all]:terms)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch the actual download."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first colon so search terms containing ':'
        # are preserved (plain split(':') raises ValueError here).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the total number of hits; never ask for more.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1510
1511
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearch[N|all]:terms)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch the actual download."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first colon so search terms containing ':'
        # are preserved (plain split(':') raises ValueError here).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # Last results page: download everything found so far.
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1592
1593
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearch[N|all]:terms)."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch the actual download."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first colon so search terms containing ':'
        # are preserved (plain split(':') raises ValueError here).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # Last results page: download everything found so far.
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1678
1679
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id in the playlist and queue it for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A direct single-video reference: hand it straight to the downloader.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # 'p' is the default prefix for playlists; artist pages need other params.
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 0

        while True:
            pagenum += 1
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's ids, de-duplicated but order-preserving.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        first_index = self._downloader.params.get('playliststart', 1) - 1
        last_index = self._downloader.params.get('playlistend', -1)
        if last_index == -1:
            video_ids = video_ids[first_index:]
        else:
            video_ids = video_ids[first_index:last_index]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1758
1759
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk the channel's paginated video list and queue every video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 0

        while True:
            pagenum += 1
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's ids, de-duplicated but order-preserving.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1810
1811
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch the user's uploads via the GData API, page by page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The Data API caps result size (currently 50 per query), so keep
        # paging until a page comes back short - that one is the last.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's ids, de-duplicated but order-preserving.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one; no need to
            # query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        first_index = self._downloader.params.get('playliststart', 1) - 1
        last_index = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[first_index:] if last_index == -1 else video_ids[first_index:last_index]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1893
1894
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve the user's numeric id, then page through their episodes."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fixed: use compat_str for unicode-safe error formatting,
                # consistent with the rest of the file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1985
1986
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Fetch the file page (with the free-download form submitted) and
        extract the direct download URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Fixed: raw string for the regex pattern (avoids an
                # invalid escape sequence in '\s').
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2045
2046
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from --username/--password or from .netrc.
        Login is best-effort: any failure only emits a warning and
        extraction proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # BUGFIX: POST data must be bytes on Python 3; encoding to ASCII
        # leaves Python 2 behavior unchanged (urlencode output is ASCII-safe).
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('ascii'))
        try:
            self.report_login()
            # BUGFIX: decode the response so the str regex below works on
            # Python 3, where read() returns bytes.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title and duration from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Normalize to the canonical video page URL
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are a JSON array wedged between these two
        # literal JavaScript fragments in the page source.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON inside the outer JSON
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2142
2143
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        Requests the JSON representation of the page; if the server answers
        with the media itself (Content-Type video/*), falls back to a direct
        download using the URL's basename as id/title.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query, respecting any existing query string
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the user agent
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # BUGFIX: only decode on Python 2, where the URL is a byte
                # string; on Python 3 it is already text and has no decode().
                if not isinstance(title, compat_str):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the container extension from the media URL
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2233
2234
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble, which raised AttributeError
            # instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media base URL is embedded in the thumbnail link
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2283
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Handles shortcut URLs (:thedailyshow / :colbert), full-episode URLs and
    clip URLs; resolves each to an MRSS index feed and emits one info dict
    per act/part of the episode.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest quality last in _real_extract's fallback pick
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (all mp4); used by --list-formats
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate; used by --list-formats
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE whitespace
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Print the --list-formats table: bitrate, extension, dimensions
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve the URL to an episode, download its MRSS index, and
        return one info dict per part of the episode."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut URLs (:tds, :colbert, ...) map to the show's
        # full-episodes page, then re-match to get the show groups.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Decide what to download: a named clip, a named episode, or
        # (dlNewest) the latest episode via the server-side redirect.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the newest episode;
            # recover the episode title from the final URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mtvnservices media URI embedded in the player markup
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS feed is one part (act) of the episode
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like mgid:...:<show>.com:<mediaId>
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # The per-item config XML lists the available renditions
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into the equivalent progressive-HTTP URL
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2477
2478
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the flv URL and metadata from an Escapist video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        show_name = match.group('showname')
        video_id = match.group('episode')

        self.report_extraction(show_name)
        try:
            page_handle = compat_urllib_request.urlopen(url)
            raw_page = page_handle.read()
            # Honor the charset declared in the Content-Type header, if any
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', page_handle.headers['Content-Type'])
            page = raw_page.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in the page's <meta> tags
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL carries the (quoted) config URL in its query string
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'flv',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2552
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Resolves the moogaloop metadata XML to an Adobe F4M manifest and
    builds an HTTP fragment (f4f) URL from it.
    """

    # Marked non-working: presumably the site layout changed — TODO confirm
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Download the metadata XML and F4M manifest, return one info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally from the two XML documents below
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The F4M manifest is namespaced XML; note video_id is re-bound here
        # to the manifest's <id> text for URL construction below.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Assemble the fragment URL from the manifest host and ids
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2623
2624
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)


        # The media URL is URL-quoted inside a flv_url query parameter
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))


        # Title comes from the page <title>, minus the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)


        # The whole matched image URL serves as the thumbnail
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2682
2683
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a SoundCloud track page to its mp3 stream URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Uploader name and song slug are both encoded in the URL path
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the canonical track URL into the track's JSON description
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the stream descriptions for this track id
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2756
2757
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com presentation pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMPE video URL, title and description of a page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a URL-quoted, base64-encoded
        # media id in the 'jsclassref' attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only: a plain split('.') raises ValueError
        # when the filename contains additional dots.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2811
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    NOTE(review): disabled via _WORKING = False.  Several '.decode(...)'
    calls below operate on str objects and therefore only work on
    Python 2 -- confirm before re-enabling.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Return the URL list for format `fmt` from the 'audio_formats' JSON.

        When the format holds per-bitrate sub-entries, pick `bitrate` (or
        the highest one if bitrate is None/'best'/not present).  If indexing
        by bitrate raises TypeError, the format carries no bitrate info and
        its value is used as the URL list directly.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Return the first URL in url_list that answers a request, else None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        # Print one "format<TAB>bitrate<TAB>[ext]" line per available stream.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    # No per-bitrate dict: the format maps straight to a URL list.
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve a mixcloud.com URL into a single downloadable audio file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE(review): .decode('utf-8') on a match group is Python-2-only.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No explicit format requested: take the first format that has a
            # currently reachable URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2926
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Dispatches on the matched URL: a specific video (course + video
    groups), a course page (expanded into its video pages), or the site
    root (expanded into all course pages).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata (title, file name) lives in an XML file
            # next to the video.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked VideoPage becomes a 'reference' entry that is
            # re-dispatched through self.extract() below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Recurse into every linked CoursePage via self.extract().
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3038
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the best rendition of an MTV video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns already-decoded text, so the match groups
        # are text strings; the previous .decode('iso-8859-1') calls on them
        # would raise AttributeError on Python 3 (str has no decode()).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message previously read 'unable to mtvn_uri' (missing verb).
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality (the last listed rendition).
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3118
3119
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves videos as multiple segments; the per-segment file ids are
    descrambled from a server-provided seed, so one input URL yields a list
    of one info dict per segment.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: current time in milliseconds plus two random numbers.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the alphabet with a linear congruential
        # generator driven by the server-provided seed; the scrambled file id
        # indexes into this shuffled list.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Translate the '*'-separated index string into the real file id
        # using the seed-shuffled alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's requested format onto Youku's stream names.
            # NOTE(review): 'worst' is mapped to mp4 and everything else to
            # flv -- presumably mp4 is the lowest-quality stream; confirm.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (two hex digits) into the file id
            # and build the per-segment download URL with its key.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3229
3230
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce that the video webpage is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that information extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the FLV url, title and thumbnail from an xnxx page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page content
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Pull each required field out of the page; abort if any is missing.
        fields = {}
        for field, pattern, what in (
                ('url', self.VIDEO_URL_RE, u'video url'),
                ('title', self.VIDEO_TITLE_RE, u'video title'),
                ('thumbnail', self.VIDEO_THUMB_RE, u'video thumbnail')):
            match = re.search(pattern, webpage)
            if match is None:
                self._downloader.trouble(u'ERROR: unable to extract %s' % what)
                return
            fields[field] = match.group(1)

        return [{
            'id': video_id,
            'url': compat_urllib_parse.unquote(fields['url']),
            'uploader': None,
            'upload_date': None,
            'title': fields['title'],
            'ext': 'flv',
            'thumbnail': fields['thumbnail'],
            'description': None,
        }]
3293
3294
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report extracted entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report extracted uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report extracted title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video link from a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bail out: without a match, mobj.group(1) below would raise
            # AttributeError instead of reporting a clean error.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bail out: indexing links[-1] below would raise IndexError.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3418
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Derive the CDN mp4 URL and page metadata for an NBA video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Key was misspelled 'uploader_date'; the documented optional
            # field (see module docstring) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3454
3455 class JustinTVIE(InfoExtractor):
3456     """Information extractor for justin.tv and twitch.tv"""
3457     # TODO: One broadcast may be split into multiple videos. The key
3458     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3459     # starts at 1 and increases. Can we treat all parts as one video?
3460
3461     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3462         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3463     _JUSTIN_PAGE_LIMIT = 100
3464     IE_NAME = u'justin.tv'
3465
3466     def report_extraction(self, file_id):
3467         """Report information extraction."""
3468         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3469
3470     def report_download_page(self, channel, offset):
3471         """Report attempt to download a single page of videos."""
3472         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3473                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3474
    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the Justin.tv API and parse its clips.

        Returns a tuple (total item count, list of info dicts for clips
        that have a video_file_url).  NOTE(review): the error paths below
        return None (not a tuple); the caller unpacks the result as a
        2-tuple, so an API/network failure would raise downstream unless
        trouble() itself aborts -- confirm intended behavior.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        # A successful API call yields a list; anything else is an error object.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-formatted; keep just YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)
3509
3510     def _real_extract(self, url):
3511         mobj = re.match(self._VALID_URL, url)
3512         if mobj is None:
3513             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3514             return
3515
3516         api = 'http://api.justin.tv'
3517         video_id = mobj.group(mobj.lastindex)
3518         paged = False
3519         if mobj.lastindex == 1:
3520             paged = True
3521             api += '/channel/archives/%s.json'
3522         else:
3523             api += '/broadcast/by_archive/%s.json'
3524         api = api % (video_id,)
3525
3526         self.report_extraction(video_id)
3527
3528         info = []
3529         offset = 0
3530         limit = self._JUSTIN_PAGE_LIMIT
3531         while True:
3532             if paged:
3533                 self.report_download_page(video_id, offset)
3534             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3535             page_count, page_info = self._parse_page(page_url)
3536             info.extend(page_info)
3537             if not paged or page_count != limit:
3538                 break
3539             offset += limit
3540         return info
3541
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the mp4 source URL, title and description from the watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not match:
            self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(match.group('url'))

        match = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not match:
            self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(match.group('title'))

        # The description is optional; fall back to None when absent.
        match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(match.group('desc')) if match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3578
class TweetReelIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        """Build the .mov download URL from the status id and scrape tweet metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        match = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not match:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = match.group(1)

        match = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not match:
            self._downloader.trouble(u'WARNING: Cannot find description')
        # Strip embedded anchor tags before unescaping the tweet text.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', match.group(1))).strip()

        match = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not match:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(match.group('uploader'))
        uploader_id = unescapeHTML(match.group('uploader_id'))

        match = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not match:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(match.group(1))).strftime('%Y%m%d')

        # The video file lives on a fixed CDN path keyed by the status id;
        # the tweet text doubles as the title.
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': desc,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }]
3627         
class SteamIE(InfoExtractor):
    # Verbose pattern: whitespace and '#' comments inside it are ignored,
    # since it is always compiled with re.VERBOSE (see suitable()).
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden here, presumably because the base-class matcher does not
        # apply re.VERBOSE to _VALID_URL — confirm against InfoExtractor.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return one info dict per trailer found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Matches the flash-player config entries: movie id, file URL and an
        # optional MOVIE_NAME.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always fetch the /video/ page for the game, whichever URL form was given.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Pair each player entry with its on-page <span class="title">,
        # relying on both appearing in the same document order.
        for vid,vtitle in zip(mweb,titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
                  }
            videos.append(info)
        return videos
3663
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN download URL and scrape title/uploader from the page."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Recorded videos are served from a fixed CDN path keyed by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"',webpage).group('title')
        uploader_match = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage)
        uploader = uploader_match.group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3685
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract show metadata from the inline 'gon.show' JSON blob."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            show = json.loads(match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream; the extension is whatever follows
        # the last '.' in the URL path.
        audio_url = show['akamai_url'] + '&cbr=256'
        audio_ext = compat_urllib_parse_urlparse(audio_url).path.rpartition('.')[2]
        host = show.get('host', {})
        return [{
            'id': video_id,
            'url': audio_url,
            'ext': audio_ext,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
3720
3721
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Scrape the download list and return the format(s) requested by the user."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 4th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: test the looked-up format, not the stale 'result'
            # regex match — previously an unknown requested format was
            # never reported and [None] was returned.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3838
3839         
3840
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the direct .flv URL and upload date; the title comes from the URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL (embedded in the flash player configuration)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: this error previously reported "unable to extract
            # video title" (copy-paste mistake).
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3882
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page and pull the flash player's file URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = match.group('title').strip()

        # The actual stream lives on a separate embed page.
        match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = match.group(0).strip()
        video_id = match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = match.group('source')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3928
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the mix's play/next API and return one info dict per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(match.group(1))

        # The play API needs a (random) session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        tracks = []
        next_url = first_url
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API signals the end of the mix explicitly.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return tracks
3972
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build fixed CDN video/thumbnail URLs and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the video and its thumbnail live on predictable CDN paths.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3996
class TEDIE(InfoExtractor):
    # Verbose pattern: matches single talks (/talks/<name>) and playlists
    # (/playlists/<id>/<name>); always compiled with re.VERBOSE below.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden here, presumably because the base-class matcher does not
        # apply re.VERBOSE to _VALID_URL — confirm against InfoExtractor.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk yields one info dict, a
        # playlist yields a list of them.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each playlist entry carries its video id and media slug as
        # data-* attributes on an <li id="talk_..."> element.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        # Pair each <li> entry with its title, relying on both appearing
        # in the same document order.
        for m_video, m_name in zip(m_videos,m_names):
            video_dic={
                       'id': m_video.group('video_id'),
                       'url': self._talk_video_link(m_video.group('mediaSlug')),
                       'ext': 'mp4',
                       'title': m_name.group('fullname')
                       }
            info.append(video_dic)
        return info
    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and media slug
        # needed to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title
                }
        return info
4068
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the myspass.de metadata XML for a video and build its info dict."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: fall back to the file extension; this branch
            # previously referenced an undefined name 'ext' and raised
            # a NameError.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4124
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        # More specific YouTube URL forms (playlists, channels, users,
        # searches) must come before the plain video extractor so they
        # get first chance to match.
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        TweetReelIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        GenericIE()  # catch-all fallback; keep last so it never shadows the others
    ]
4176
4177