Add auxiliary methods to InfoExtractor to set the '_type' key and use them for some...
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    _type:          The type of the result ('video', 'url' or 'playlist');
                    set it with the video_result/url_result/playlist_result
                    helpers below. Defaults to 'video' when absent.
    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; instances override them in __init__/initialize().
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() runs at most once per instance.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default name: class name minus the trailing 'IE' suffix.
        # Subclasses may simply override this with a plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the response handle.

        Raises ExtractorError (with the original traceback attached) on any
        network-level failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string.

        The body is decoded as UTF-8; undecodable bytes are replaced rather
        than raising, since many sites serve slightly broken encodings.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        return webpage_bytes.decode('utf-8', 'replace')

    # Methods for following #608.
    # They set the correct value of the '_type' key.

    def video_result(self, video_info):
        """Returns a video.

        Marks video_info (mutated in place) as a plain video result.
        """
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed.

        `ie` is currently unused.
        TODO: ie should be the class used for getting the info.
        """
        video_info = {'_type': 'url',
                      'url': url}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist.

        `entries` is a list of result dicts (videos or urls). The optional
        `playlist_id` and `playlist_title` are stored under 'id' and 'title'
        when given; when omitted the returned dict is unchanged from the
        original two-key form, so existing callers are unaffected.
        """
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id is not None:
            video_info['id'] = playlist_id
        if playlist_title is not None:
            video_info['title'] = playlist_title
        return video_info
149
150
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1 matches the URL prefix (if any); group 2 is the video ID.
    # _extract_id() relies on this group numbering.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; unlisted itags default to 'flv' at use sites.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> 'heightxwidth' display string (used for --list-formats and 'format').
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; defer to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text.

        Captions without an explicit duration default to 4 seconds.
        """
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions and return (error_message_or_None, srt_or_None).

        Language is chosen from --subtitleslang, then 'en', then the first
        language YouTube lists. Errors are reported as warning strings rather
        than raised.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print 'itag : extension [dimensions]' for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set YouTube language, optionally log in, and confirm age.

        Every failure is reported through the downloader and aborts
        initialization silently (no exception escapes).
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so the date/metadata regexes below match)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX/dsh form tokens the Google login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from any URL form accepted by _VALID_URL.

        Returns None (after reporting trouble) on a non-matching URL.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # group(2) is the ID; group(1) is the optional URL prefix.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants, since different video
        # classes (embedded, vevo, ...) answer to different ones.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (optional: only a warning on failure)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize the scraped date to YYYYMMDD
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except tolerates non-matching formats
                    # but also swallows unrelated errors
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig' field;
            # an entry without one raises KeyError here — TODO confirm
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            # Keep only formats the site actually offers, best quality first.
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
609
610
611 class MetacafeIE(InfoExtractor):
612     """Information Extractor for metacafe.com."""
613
614     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
615     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
616     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
617     IE_NAME = u'metacafe'
618
619     def __init__(self, downloader=None):
620         InfoExtractor.__init__(self, downloader)
621
622     def report_disclaimer(self):
623         """Report disclaimer retrieval."""
624         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
625
626     def report_age_confirmation(self):
627         """Report attempt to confirm age."""
628         self._downloader.to_screen(u'[metacafe] Confirming age')
629
630     def report_download_webpage(self, video_id):
631         """Report webpage download."""
632         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
633
634     def report_extraction(self, video_id):
635         """Report information extraction."""
636         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
637
638     def _real_initialize(self):
639         # Retrieve disclaimer
640         request = compat_urllib_request.Request(self._DISCLAIMER)
641         try:
642             self.report_disclaimer()
643             disclaimer = compat_urllib_request.urlopen(request).read()
644         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
645             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
646             return
647
648         # Confirm age
649         disclaimer_form = {
650             'filters': '0',
651             'submit': "Continue - I'm over 18",
652             }
653         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
654         try:
655             self.report_age_confirmation()
656             disclaimer = compat_urllib_request.urlopen(request).read()
657         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
658             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
659             return
660
    def _real_extract(self, url):
        """Extract and return a one-element list with the info dict for a
        Metacafe watch URL, or None (after reporting) on failure."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        # Metacafe mirrors YouTube videos under ids of the form 'yt-<id>';
        # those are handed back to the downloader so the YouTube extractor
        # can take over.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # Preferred path: a direct '&mediaURL=' parameter embedded in the page.
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # File extension is taken from the URL's last three characters.
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            # (when present it must be appended as '__gda__' for the CDN
            # to serve the file).
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback path: parse the flash player's 'flashvars' blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            # The URL is JSON-escaped ('\/' for '/') inside mediaData.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls below assume byte strings
        # (Python 2 semantics); under Python 3 regex groups are str and have
        # no .decode — confirm the supported interpreter.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
736
737
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion

    Scrapes a dailymotion.com watch page and returns the best-quality
    video URL found in the page's flashvars, together with title,
    uploader and upload date.
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # Marked broken: the extractor is currently known not to work.
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*, or None
        (after reporting an error) when extraction fails."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The path component looks like '<id>_<slug>'; everything after the
        # first '_' (and any query string) is cosmetic and is stripped.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Qualities are probed best-first; the first key present wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # The URL is JSON-escaped inside flashvars ('\/' for '/').
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                # NOTE(review): this is phrased as a WARNING but is routed
                # through trouble(), which is normally the error channel —
                # confirm whether to_screen() was intended here.
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date appears on the page as DD-MM-YYYY; reassemble into
        # the canonical YYYYMMDD form.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
825
826
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    The video id is the .flv file name carried in the page URL's
    'current=' query parameter; the media URL, title and uploader are
    scraped from the watch page itself.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*, or None
        (after reporting an error) on failure."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Photobucket only serves .flv files on this code path.
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # The <title> carries both the video title and the uploader name.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls assume byte strings
        # (Python 2); under Python 3 these would raise AttributeError —
        # confirm the supported interpreter.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
890
891
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-'/watch/' Yahoo! Video URLs are first rewritten to their canonical
    English-language '/watch/' form; the watch page is then scraped for
    metadata and the playlist service is queried for the final FLV URL.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dict for *url*, or None
        (after reporting an error) on failure.

        *new_video* is False only on the single recursive call made after
        a non-'/watch/' URL has been rewritten to its canonical form.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUG FIX: group(1) captures the literal 'people'/'profile' path
        # component; the uploader's display name is in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; both are required by the
        # playlist request below.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1033
1034
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the embedded player config JSON out of the watch page and
    chooses the best codec/quality combination known to this extractor.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dict for *url*, or None
        (after reporting an error) on failure.

        Note: *new_video* is currently unused; it is kept for signature
        compatibility with callers.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize protocol-less and direct-link URLs to a canonical
        # https://vimeo.com/<id> page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # FIX: was a bare 'except:', which also swallowed
            # KeyboardInterrupt/SystemExit.  'except Exception' still covers
            # the expected IndexError/ValueError from the split/parse above.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (ISO date in a meta tag -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available quality bucket, best first.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1153
1154
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both live-stream index pages and regular 'Plus 7' catch-up
    videos by chaining several regex scrapes across intermediate pages.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are recognized by their 'index-<n>.html' page name.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body, or None after reporting an
        error through the downloader."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and match *regex* (with *regexFlags*) against it.

        *matchTuples* is a list of (group_index, key, error_message)
        triples; each matched group is stored under *key* in the returned
        dict.  Returns None (after reporting) when the match or any group
        is missing.

        NOTE(review): if fetch_webpage() fails it returns None, and
        re.search(regex, None, ...) then raises TypeError instead of
        reporting cleanly — confirm whether a guard is needed.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS to its rtmp URL.

        BUG(review): video_url is computed on the last line but never
        returned, so this method always yields None and live URLs produce
        no download — confirm intended behavior before relying on it.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve a 'Plus 7' catch-up URL through its XML descriptors and
        return the info dict for the HD stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus-7 path based on the URL.

        NOTE(review): the live branch returns None (see extractLiveStream),
        so only Plus-7 URLs currently produce a result list.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1289
1290
1291 class GenericIE(InfoExtractor):
1292     """Generic last-resort information extractor."""
1293
1294     _VALID_URL = r'.*'
1295     IE_NAME = u'generic'
1296
1297     def __init__(self, downloader=None):
1298         InfoExtractor.__init__(self, downloader)
1299
1300     def report_download_webpage(self, video_id):
1301         """Report webpage download."""
1302         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1303         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1304
1305     def report_extraction(self, video_id):
1306         """Report information extraction."""
1307         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1308
1309     def report_following_redirect(self, new_url):
1310         """Report information extraction."""
1311         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1312
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request for *url* (falling back to GET when the server
        answers 405) and, if the final URL differs, hands the new URL back
        to the downloader and returns True so the caller can stop.  Returns
        False when *url* is not redirected.
        """
        class HeadRequest(compat_urllib_request.Request):
            # A Request whose HTTP method is HEAD instead of GET.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers must not be forwarded on a HEAD.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the 405 response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE(review): a bare OpenerDirector is assembled by hand, so proxy
        # and cookie handlers from the default opener do not apply here —
        # confirm this is intentional.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        # Restart the extraction chain with the redirect target.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1367
1368     def _real_extract(self, url):
1369         if self._test_redirect(url): return
1370
1371         video_id = url.split('/')[-1]
1372         request = compat_urllib_request.Request(url)
1373         try:
1374             self.report_download_webpage(video_id)
1375             webpage = compat_urllib_request.urlopen(request).read()
1376         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1377             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1378             return
1379         except ValueError as err:
1380             # since this is the last-resort InfoExtractor, if
1381             # this error is thrown, it'll be thrown here
1382             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1383             return
1384
1385         self.report_extraction(video_id)
1386         # Start with something easy: JW Player in SWFObject
1387         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1388         if mobj is None:
1389             # Broaden the search a little bit
1390             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1391         if mobj is None:
1392             # Broaden the search a little bit: JWPlayer JS loader
1393             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1394         if mobj is None:
1395             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1396             return
1397
1398         # It's possible that one of the regexes
1399         # matched, but returned an empty group:
1400         if mobj.group(1) is None:
1401             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1402             return
1403
1404         video_url = compat_urllib_parse.unquote(mobj.group(1))
1405         video_id = os.path.basename(video_url)
1406
1407         # here's a fun little line of code for you:
1408         video_extension = os.path.splitext(video_id)[1][1:]
1409         video_id = os.path.splitext(video_id)[0]
1410
1411         # it's tempting to parse this further, but you would
1412         # have to take into account all the variations like
1413         #   Video Title - Site Name
1414         #   Site Name | Video Title
1415         #   Video Title - Tagline | Site Name
1416         # and so on and so forth; it's just not practical
1417         mobj = re.search(r'<title>(.*)</title>', webpage)
1418         if mobj is None:
1419             self._downloader.trouble(u'ERROR: unable to extract title')
1420             return
1421         video_title = mobj.group(1)
1422
1423         # video uploader is domain name
1424         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1425         if mobj is None:
1426             self._downloader.trouble(u'ERROR: unable to extract title')
1427             return
1428         video_uploader = mobj.group(1)
1429
1430         return [{
1431             'id':       video_id,
1432             'url':      video_url,
1433             'uploader': video_uploader,
1434             'upload_date':  None,
1435             'title':    video_title,
1436             'ext':      video_extension,
1437         }]
1438
1439
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse an 'ytsearch[N|all]:terms' query and download its results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUG FIX: split only on the first ':' so search terms containing
        # a colon do not raise ValueError on tuple unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the 'ytsearch' literal, keeping N or 'all'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have
        # either n ids or as many as the API says exist.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Clamp the target to the real total reported by the API.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # renamed loop variable to avoid shadowing the builtin 'id'
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1518
1519
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearch[N|all]:terms' query and download its results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUG FIX: split only on the first ':' so search terms containing
        # a colon do not raise ValueError on tuple unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the 'gvsearch' literal, keeping N or 'all'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        # Scrape result pages until n ids are collected or no "next" link
        # remains on the page.
        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1600
1601
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearch[N|all]:terms' query and download its results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUG FIX: split only on the first ':' so search terms containing
        # a colon do not raise ValueError on tuple unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the 'yvsearch' literal, keeping N or 'all'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        # Scrape result pages until n ids are collected or no "Next" link
        # remains on the page.
        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1686
1687
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE: this pattern is always compiled with re.VERBOSE (see suitable()
    # and _real_extract()), so the literal whitespace below is ignored.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    # Maximum number of entries the GData API returns per request.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden from the base class because _VALID_URL needs re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch a playlist's videos from the GData API and return a single
        playlist result containing one url entry per video."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API
        # group(1) captures ids embedded in a URL path, group(2) bare ids.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []  # accumulates (playlist position, video URL) tuples

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Record each entry's playlist position so the final list can be
            # ordered; entries without 'content' are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A page shorter than _MAX_RESULTS is the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then drop the positions.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Apply the user's --playlist-start/--playlist-end window;
        # a playlistend of -1 means "through the last video".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        url_results = [self.url_result(url) for url in videos]
        return [self.playlist_result(url_results)]
1779
1780
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id on a channel and return them as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # Walk the paginated channel listing until the "next page" marker
        # no longer appears in the HTML.
        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather the watch-page ids on this page, skipping duplicates
            # within the page.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        return [self.playlist_result([self.url_result(u) for u in watch_urls])]
1831
1832
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch every upload of a user via the GData API and return them
        as a playlist, honoring --playlist-start/--playlist-end."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request successive pages until one comes back short.
        video_ids = []
        page_idx = 0

        while True:
            start_index = page_idx * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the watch ids found on this page, de-duplicated
            # within the page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page shorter than the page size means nothing is left.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_idx += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # A playlistend of -1 means "through the last video".
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        return [self.playlist_result([self.url_result(u) for u in watch_urls])]
1915
1916
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all videos of a blip.tv user and return them as a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # BUG FIX: a page without a data-users-id attribute previously
        # crashed with an uncaught AttributeError; report it instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract users-id')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # BUG FIX: use compat_str (not bare str) like the rest of the
                # file, so non-ASCII error text does not break on Python 2.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A page shorter than _PAGE_SIZE is the last one; stop querying.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # A playlistend of -1 means "through the last video".
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries)]
2007
2008
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title for a DepositFiles file.

        NOTE(review): the .decode('utf-8') calls below and the regex
        matching against the raw (undecoded) response assume Python 2
        byte strings — confirm before running this under Python 3.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button click)
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's own error text for display
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        # extension without the leading dot, e.g. 'zip'
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2067
2068
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook using credentials from the options or .netrc.

        Missing credentials or a failed login only produce a warning;
        extraction then proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        email = None
        password = None
        params = self._downloader.params

        # Prefer explicitly supplied credentials, then fall back to .netrc.
        if params.get('username', None) is not None:
            email = params['username']
            password = params['password']
        elif params.get('usenetrc', False):
            try:
                auth = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if auth is None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                email = auth[0]
                password = auth[2]
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if email is None:
            # Nothing to log in with.
            return

        # Submit the mobile login form.
        form_data = compat_urllib_parse.urlencode({
            'email': email,
            'pass': password,
            'login': 'Log+In'
        })
        request = compat_urllib_request.Request(self._LOGIN_URL, form_data)
        try:
            self.report_login()
            login_page = compat_urllib_request.urlopen(request).read()
            # On failure the response still contains the login form.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_page) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract a single mp4 video from a Facebook video page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = m.group('ID')

        page_url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(page_url, video_id)

        # The swf variables are serialized as JSON between these two exact
        # JavaScript fragments.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        data_match = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not data_match:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(data_match.group(1)))
        params = json.loads(compat_urllib_parse.unquote(data['params']))
        # Prefer the HD stream, falling back to SD.
        video_url = params['hd_src'] or params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        title_match = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title in webpage')

        return [{
            'id': video_id,
            'title': unescapeHTML(title_match.group(1)),
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }]
2164
2165
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL turned out to be a direct media download."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        Handles three cases:
        - /play/ embed URLs: follow the redirect, read the file id from the
          redirect target's fragment, and recurse on the canonical
          http://blip.tv/a/a-<id> URL.
        - URLs answered with a video/* Content-Type: treat as a direct
          download and hand the open urlhandle to the downloader.
        - Regular page URLs: fetch the JSON metadata (skin=json) and build
          the info dict from it.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Embed player URL: the real file id is in the fragment of the
            # redirect target.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Ask the page for its JSON representation.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # NOTE(review): presumably blip.tv serves simpler data to iTunes
        # clients; the same UA is echoed in the info dict below.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode is Python 2 only; this branch
                # would raise AttributeError on Python 3 -- confirm runtime.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh  # reuse the already-open connection
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above and is still live here.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # e.g. '08-15-12 03:24PM' -> '20120815'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',  # must match the request header above
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2266
2267
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # NOTE: the redundant pass-through __init__ was removed; the inherited
    # InfoExtractor constructor has the same signature and behavior.

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv media URL and title from a myvideo.de watch page.

        Returns a single-element list with the info dict, or None after
        reporting an error via the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this previously called self._download.trouble (no such
            # attribute), raising AttributeError instead of reporting the
            # invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server path; the flv lives
        # next to the thumbnails, named after the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2316
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates (as strings) the site is known to offer.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> display resolution, used by _print_formats only.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report the download of one part's media configuration."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report the download of the episode's show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available formats (used for --list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all video parts for an episode or clip.

        Resolves shortname URLs (e.g. ':tds') to the show's full-episodes
        page, downloads the mRSS index listing the parts, fetches each
        part's mediagen configuration, picks a bitrate, and rewrites the
        rtmp URL into a plain http one. Returns a list with one info dict
        per part.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Abbreviated form (':tds', ':colbert', ...): substitute the
            # show's full-episodes URL and re-match.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The server redirected to the newest episode; re-parse the
            # final URL to obtain its episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mtvnservices media URI embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information without a URL
            # prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The mRSS index enumerates the parts of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid is colon-separated: the last field is the media id, the
            # second-to-last the show domain.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the mediagen renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into the equivalent plain-http one on the
            # CDN; only gsp.comedystor paths are known to transform.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2511
2512
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the player-configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract an Escapist video.

        Reads the page's og: meta tags to locate the Flash player URL,
        downloads the JSON-ish config that the player points at, and
        returns the flv URL from its playlist.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset from the Content-Type header, default utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Robustness fix: these searches previously dereferenced .group(1)
        # without a None check, so a page-layout change crashed with
        # AttributeError instead of producing a proper error message.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        # Description and thumbnail are optional metadata: degrade to None
        # rather than failing the whole extraction.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes must be
        # normalized before json.loads will accept it.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # Index 1 holds the actual video; presumably index 0 is an intro
        # entry -- confirm against a live config.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
        }

        return [info]
2586
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): presumably flags this extractor as broken/untested;
    # the flag's semantics are defined on the base class -- confirm.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report the download of the XML manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a collegehumor video.

        Fetches the moogaloop metadata XML for title/description/thumbnail,
        then the f4m manifest it points at, and assembles the URL of the
        stream's first fragment.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally as the two XML documents are parsed.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            # Any missing element means the metadata layout changed.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # NOTE(review): hdcore looks like a required HDS player-version
        # parameter -- confirm before changing.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Manifest elements live in the Adobe f4m namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first-fragment URL from the manifest location plus the
        # ids read above.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2657
2658
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve an xvideos watch page into a single flv download."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is percent-encoded in a flashvars-style parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': thumb_match.group(0),
            'description': None,
        }]
2716
2717
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       Resolves the track page through the public API (resolve.json) to get
       the numeric track id, then queries the streams endpoint for the
       128kbps mp3 stream URL.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that the stream URL is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a soundcloud.com track as an mp3 info dict."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The URL path carries both the uploader and the track slug.
        uploader = match.group(1)
        slug_title = match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2790
2791
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Extract the rtmpe stream URL and metadata from an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The stream id is embedded base64/url-encoded in the page source.
        match = re.search(r"jsclassref='([^']*)'", webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        encoded_ref = match.group(1)
        real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_ref.encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title comes from an inline JavaScript assignment.
        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # The description meta tag is optional.
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = match.group(1) if match is not None else u'No description available.'

        # Derive id and extension from the last path component of the stream.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2845
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format name to either {bitrate: url list} or
        directly to a url list when no bitrate info exists; 'best' (or an
        unavailable bitrate) selects the highest bitrate present.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass # dead link, try the next candidate

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a cloudcast via the mixcloud JSON API.

        Returns a one-element list of info dicts, or None after
        reporting trouble.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # BUGFIX: re.match against a str yields str groups already; the old
        # .decode('utf-8') calls raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None
        # BUGFIX: ensure format_param is bound even when 'formats' is empty,
        # so the info dict below cannot raise NameError.
        format_param = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2960
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Three URL shapes share one pattern: a specific video (course+video),
    # a course page (course only), and the site root (neither group set).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, or recursively expand a course/root page.

        Playlist-like pages yield a 'type': 'playlist' info dict whose
        'list' entries are 'reference' dicts, each re-dispatched through
        self.extract(); the concatenated results are returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and relative video path come from the per-video XML.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id if the page has no <h1>.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # One reference entry per linked VideoPage, in page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # One reference entry per linked CoursePage; each expands
            # recursively through the course branch above.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3072
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV video.

        Reads meta tags from the video page, then fetches the mediaGen
        XML to pick a rendition.  Returns a one-element list of info
        dicts, or None after reporting trouble.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # BUGFIX: _download_webpage returns an already-decoded unicode
        # string, so the old .decode('iso-8859-1') on the captured groups
        # raised AttributeError on Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUGFIX: message previously read "unable to mtvn_uri".
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen service returns the rendition list as XML.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # type is e.g. "video/mp4"; keep only the subtype as extension.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3152
3153
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: current epoch milliseconds followed by two random
        # numbers, concatenated as a plain decimal string.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Build a lookup alphabet by repeatedly advancing a linear
        # congruential generator from the server-provided seed and
        # drawing (without replacement) from the source character set.
        # The same seed always yields the same permutation.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the seed-shuffled
        # alphabet; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Extract all segments of a Youku video as separate info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested format to Youku's stream names:
            # hd2 (falling back to flv) for best, mp4 for worst, flv otherwise.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One key per segment; each key is required in the download URL.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3263
3264
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Report information extraction"""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Pull the flv url, title and thumbnail out of the page source."""
        url_mobj = re.match(self._VALID_URL, url)
        if url_mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The direct flv link is url-encoded inside the player parameters.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3327
3328
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video link from a G+ post.

        Returns a one-element list of info dicts, or None after
        reporting trouble.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUGFIX: bail out; previously fell through and dereferenced None.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUGFIX: bail out; previously fell through and indexed an
            # empty list.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3452
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN mp4 URL from the page path and scrape metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The 720p rendition lives at a predictable CDN path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: was 'uploader_date', a typo for the documented
            # 'upload_date' field, so the date was silently ignored.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3488
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_dicts).

        On error the problem is reported through the downloader and
        (0, []) is returned, so the caller's tuple unpacking and its
        pagination loop terminate cleanly.  (Previously this returned
        None, which crashed _real_extract with a TypeError whenever
        trouble() did not raise, e.g. with --ignore-errors.)
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        # A non-list response is the API's error envelope (a dict).
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' begins with an ISO date; strip the dashes
                # to get the YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 only => channel URL: page through the archive listing.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3575
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Raise instead of only reporting: the original fell through to
            # m.group() on None, crashing with an AttributeError.
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            # Same fall-through bug as above: abort explicitly.
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        # Description is optional; leave it as None when absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3612
class SteamIE(InfoExtractor):
    """Information extractor for Steam store game trailer pages."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so match with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')

        # Always fetch the game's dedicated video page.
        page_url = 'http://store.steampowered.com/video/%s/' % game_id
        webpage = self._download_webpage(page_url, game_id)

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # Movie entries, display titles and thumbnails appear in the same
        # order on the page, so walk the three match streams in lockstep.
        results = []
        for movie, name, thumb in zip(re.finditer(movie_re, webpage),
                                      re.finditer(title_re, webpage),
                                      re.finditer(thumb_re, webpage)):
            movie_id = movie.group('videoID')
            movie_url = movie.group('videoURL')
            if not movie_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % movie_id)
            results.append({
                'id': movie_id,
                'url': movie_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return results
3653
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title and uploading channel id are exposed as data-* attributes.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            # The flv lives at a predictable CDN path derived from the id.
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3675
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # All show metadata is embedded as a JSON blob in an inline script.
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if json_match is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3710
3711
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict matching req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site serves the download list only to age-verified visitors.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component is '<size>_<bitrate>_<id>'; keep the
            # first two parts as the format identifier (e.g. '480p-370k').
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: previously tested the stale 'result' variable (never
            # None here), so a missing format silently returned [None].
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3828
3829
3830
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the id and the title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL sits inside the player configuration.
        url_match = re.search(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group('url'))

        # The upload date is shown in the "Added ... by" byline.
        date_match = re.search(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if date_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': date_match.group('date'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
3872
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, mobj.group('videoid'))

        # The page <title> doubles as the video title.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream is referenced from a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The embed page passes the file URL to the flash player.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3918
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment in the page scripts.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id lets us walk the play/next API for the mix.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        uploader_id = data['user']['login']

        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': uploader_id,
                'ext': 'm4a',
            })
            # The API flags the final track; otherwise ask for the next one.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3962
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title comes from the OpenGraph metadata, uploader from the bio box.
        title = unescapeHTML(
            re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage).group('title'))
        uploader = unescapeHTML(
            re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage).group('uploader'))

        return [{
            'id': video_id,
            # Video and thumbnail URLs follow fixed CDN path patterns.
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
3986
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so it must be matched
        # with re.VERBOSE explicitly.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk or a whole playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Matches each playlist <li> entry; only video_id and mediaSlug
        # groups are consumed below.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # Captures the talk's relative URL and display name.
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        # Entries and name links appear in the same page order, so the two
        # match streams are walked in lockstep.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Each talk page is fetched individually for its full info.
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline JS object; pull id and mediaSlug from it.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # The downloadable mp4 URL is derived from the media slug.
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4059
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fixed: previously assigned the undefined name 'ext' here,
            # raising a NameError; fall back to the file extension as the
            # format, matching the documented default.
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional in the metadata XML.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4115
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Ordered from most to least specific; GenericIE is the catch-all.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4166
4167