Use extract_info in YoutubePlaylist and YoutubeSearch
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
from __future__ import absolute_import

import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Flips to True once _real_initialize() has run (see initialize()).
    _ready = False
    # FileDownloader instance used for messages and option lookup; may be None.
    _downloader = None
    # Subclasses set this to False when the site support is known broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() is run at most once per instance.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by dropping the trailing "IE" suffix
        # (e.g. "YoutubeIE" -> "Youtube"); subclasses may override with a
        # plain class attribute instead.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle.

        Announces the download on the downloader's screen and raises
        ExtractorError on any network-level failure, forwarding the
        original traceback (requires the module-level 'import sys').
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string.

        The body is decoded as UTF-8 with undecodable bytes replaced.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        return webpage_bytes.decode('utf-8', 'replace')
132
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: group 1 matches the whole optional URL prefix,
    # group 2 is the video ID itself (see _extract_id()).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string used in --list-formats output.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so defer to the playlist IE first.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # A missing dur attribute defaults to a 4-second caption.
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # Format as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns a (warning_message, srt_contents) pair where exactly one
        element is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the listing XML.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user option, then English, then first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, log in (credentials or .netrc) and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX and dsh form tokens from the login page.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form re-appearing in the response means auth failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID (group 2 of _VALID_URL) or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the backslash-escaped URL found in the page's JS config.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            # Try each known date layout; the bare except keeps the first
            # successful parse and silently skips formats that do not match.
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry also carries a 'sig'
            # field; an entry without it would raise KeyError here — confirm
            # against current get_video_info output.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected (format, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
591
592
593 class MetacafeIE(InfoExtractor):
594     """Information Extractor for metacafe.com."""
595
596     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
597     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
598     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
599     IE_NAME = u'metacafe'
600
601     def __init__(self, downloader=None):
602         InfoExtractor.__init__(self, downloader)
603
604     def report_disclaimer(self):
605         """Report disclaimer retrieval."""
606         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
607
608     def report_age_confirmation(self):
609         """Report attempt to confirm age."""
610         self._downloader.to_screen(u'[metacafe] Confirming age')
611
612     def report_download_webpage(self, video_id):
613         """Report webpage download."""
614         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
615
616     def report_extraction(self, video_id):
617         """Report information extraction."""
618         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
619
620     def _real_initialize(self):
621         # Retrieve disclaimer
622         request = compat_urllib_request.Request(self._DISCLAIMER)
623         try:
624             self.report_disclaimer()
625             disclaimer = compat_urllib_request.urlopen(request).read()
626         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
627             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
628             return
629
630         # Confirm age
631         disclaimer_form = {
632             'filters': '0',
633             'submit': "Continue - I'm over 18",
634             }
635         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
636         try:
637             self.report_age_confirmation()
638             disclaimer = compat_urllib_request.urlopen(request).read()
639         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
640             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
641             return
642
643     def _real_extract(self, url):
644         # Extract id and simplified title from URL
645         mobj = re.match(self._VALID_URL, url)
646         if mobj is None:
647             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
648             return
649
650         video_id = mobj.group(1)
651
652         # Check if video comes from YouTube
653         mobj2 = re.match(r'^yt-(.*)$', video_id)
654         if mobj2 is not None:
655             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
656             return
657
658         # Retrieve video webpage to extract further information
659         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
660         try:
661             self.report_download_webpage(video_id)
662             webpage = compat_urllib_request.urlopen(request).read()
663         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
664             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
665             return
666
667         # Extract URL, uploader and title from webpage
668         self.report_extraction(video_id)
669         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
670         if mobj is not None:
671             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
672             video_extension = mediaURL[-3:]
673
674             # Extract gdaKey if available
675             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
676             if mobj is None:
677                 video_url = mediaURL
678             else:
679                 gdaKey = mobj.group(1)
680                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
681         else:
682             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
683             if mobj is None:
684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
685                 return
686             vardict = compat_parse_qs(mobj.group(1))
687             if 'mediaData' not in vardict:
688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
689                 return
690             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
691             if mobj is None:
692                 self._downloader.trouble(u'ERROR: unable to extract media URL')
693                 return
694             mediaURL = mobj.group(1).replace('\\/', '/')
695             video_extension = mediaURL[-3:]
696             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
697
698         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
699         if mobj is None:
700             self._downloader.trouble(u'ERROR: unable to extract title')
701             return
702         video_title = mobj.group(1).decode('utf-8')
703
704         mobj = re.search(r'submitter=(.*?);', webpage)
705         if mobj is None:
706             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
707             return
708         video_uploader = mobj.group(1)
709
710         return [{
711             'id':       video_id.decode('utf-8'),
712             'url':      video_url.decode('utf-8'),
713             'uploader': video_uploader.decode('utf-8'),
714             'upload_date':  None,
715             'title':    video_title,
716             'ext':      video_extension.decode('utf-8'),
717         }]
718
719
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # Marked as broken; the extractor is skipped unless explicitly requested.
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Scrape a Dailymotion watch page.

        Returns a one-element list with the video info dictionary, or
        None after reporting trouble.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the "_title" suffix and any query string from the id segment
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still load
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the first (i.e. best) available quality, highest first
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            # for/else: no known quality key was present at all
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # The URL is JSON-escaped: unescape "\/" after percent-decoding
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for an official user instead
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                # Uploader is optional: warn but keep going with None
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date appears on the page as DD-MM-YYYY; convert to YYYYMMDD
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
807
808
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        message = u'[photobucket] %s: Downloading webpage' % video_id
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[photobucket] %s: Extracting information' % video_id
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Scrape a Photobucket page and return a one-element list with
        the video info dictionary (None after reporting trouble)."""
        # The video id is the first capture group of _VALID_URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Fetch the page the URL points at
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Locate the media URL inside the page
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader both come from one <title> pattern
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
872
873
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-'/watch/' URLs are first resolved to their canonical
    http://video.yahoo.com/watch/<vid>/<id> form via a single recursive
    call; the watch page is then scraped for metadata, and the playlist
    XML for the actual media URL.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the video info dictionary, or
        None after reporting trouble."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the canonical /watch/ URL
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) captures the 'people'/'profile' path segment;
        # the uploader name is the second capture group.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1015
1016
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the video info dictionary, or
        None after reporting trouble."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: force https and canonicalize direct links
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Narrowed from a bare "except:" so KeyboardInterrupt and
            # SystemExit still propagate; any slicing/JSON failure is
            # reported as trouble instead.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (ISO date in a meta tag -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best bucket that has at least one entry
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1135
1136
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams end in "index-<n>.html"
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its raw contents (None after trouble)."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map capture groups to dict keys.

        matchTuples is a list of (group_index, key, error_message);
        returns the resulting dict, or None after reporting trouble.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None and
        # re.search will raise — presumably acceptable here; confirm.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to an rtmp URL."""
        # Language code is embedded in the URL path
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned, and
        # _real_extract discards this method's result — live streams
        # currently yield no info dict. Confirm intended behavior.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page to its video XML and build an info dict."""
        # Language code is embedded in the URL path
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: the per-video XML with id, title, date and HD URL
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1271
1272
1273 class GenericIE(InfoExtractor):
1274     """Generic last-resort information extractor."""
1275
1276     _VALID_URL = r'.*'
1277     IE_NAME = u'generic'
1278
    def __init__(self, downloader=None):
        # Delegate to the shared InfoExtractor initialisation.
        InfoExtractor.__init__(self, downloader)
1281
1282     def report_download_webpage(self, video_id):
1283         """Report webpage download."""
1284         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1285         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1286
1287     def report_extraction(self, video_id):
1288         """Report information extraction."""
1289         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1290
1291     def report_following_redirect(self, new_url):
1292         """Report information extraction."""
1293         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1294
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are fetched
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Getting the same URL back means no redirect happened
        if url == new_url:
            return False

        # Hand the resolved URL back to the downloader and stop this chain
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1349
1350     def _real_extract(self, url):
1351         if self._test_redirect(url): return
1352
1353         video_id = url.split('/')[-1]
1354         request = compat_urllib_request.Request(url)
1355         try:
1356             self.report_download_webpage(video_id)
1357             webpage = compat_urllib_request.urlopen(request).read()
1358         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1359             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1360             return
1361         except ValueError as err:
1362             # since this is the last-resort InfoExtractor, if
1363             # this error is thrown, it'll be thrown here
1364             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1365             return
1366
1367         self.report_extraction(video_id)
1368         # Start with something easy: JW Player in SWFObject
1369         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1370         if mobj is None:
1371             # Broaden the search a little bit
1372             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1373         if mobj is None:
1374             # Broaden the search a little bit: JWPlayer JS loader
1375             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1376         if mobj is None:
1377             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1378             return
1379
1380         # It's possible that one of the regexes
1381         # matched, but returned an empty group:
1382         if mobj.group(1) is None:
1383             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1384             return
1385
1386         video_url = compat_urllib_parse.unquote(mobj.group(1))
1387         video_id = os.path.basename(video_url)
1388
1389         # here's a fun little line of code for you:
1390         video_extension = os.path.splitext(video_id)[1][1:]
1391         video_id = os.path.splitext(video_id)[0]
1392
1393         # it's tempting to parse this further, but you would
1394         # have to take into account all the variations like
1395         #   Video Title - Site Name
1396         #   Site Name | Video Title
1397         #   Video Title - Tagline | Site Name
1398         # and so on and so forth; it's just not practical
1399         mobj = re.search(r'<title>(.*)</title>', webpage)
1400         if mobj is None:
1401             self._downloader.trouble(u'ERROR: unable to extract title')
1402             return
1403         video_title = mobj.group(1)
1404
1405         # video uploader is domain name
1406         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1407         if mobj is None:
1408             self._downloader.trouble(u'ERROR: unable to extract title')
1409             return
1410         video_uploader = mobj.group(1)
1411
1412         return [{
1413             'id':       video_id,
1414             'url':      video_url,
1415             'uploader': video_uploader,
1416             'upload_date':  None,
1417             'title':    video_title,
1418             'ext':      video_extension,
1419         }]
1420
1421
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo-URLs of the form ytsearch[N|all]:<terms> and queues the
    first N matching videos (default 1, capped at _max_youtube_results) for
    download via the GData search API.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUGFIX: split only on the first ':' -- search terms may themselves
        # contain colons, which previously made the 2-tuple unpacking raise
        # ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Don't page past what the API says is actually available.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1500
1501
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles pseudo-URLs of the form gvsearch[N|all]:<terms>, scraping the
    HTML result pages until N ids are collected or no next page exists.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUGFIX: split only on the first ':' so queries containing a colon
        # do not break the 2-tuple unpacking with ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: flush whatever we collected and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1582
1583
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles pseudo-URLs of the form yvsearch[N|all]:<terms>. Currently
    marked non-working (_WORKING = False).
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUGFIX: split only on the first ':' so queries containing a colon
        # do not break the 2-tuple unpacking with ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" marker: flush whatever we collected and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1668
1669
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Resolves a playlist/course/artist/user URL (or a bare PL/EC/UU id) to
    the ordered list of video URLs via the GData playlists API, then hands
    them to the downloader's extract_info_iterable.
    """

    # VERBOSE regex: group 1 matches ids embedded in a youtube.com URL,
    # group 2 matches a bare PL/EC/UU playlist id passed on its own.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData feed, paged via max-results/start-index (both 1-based).
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50  # API page size
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []  # list of (position, video_url) pairs, sorted later

        while True:
            self.report_download_page(playlist_id, page_num)

            # start-index is 1-based: page 1 -> 1, page 2 -> 51, ...
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Entries without 'content' (e.g. deleted/private videos) are skipped;
            # yt$position preserves the playlist ordering across pages.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # playliststart is 1-based in params; convert to a 0-based slice start.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            # playlistend is 1-based inclusive, which matches an exclusive
            # 0-based slice end directly.
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        return self._downloader.extract_info_iterable(videos)
1760
1761
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the channel's paginated video listing, collects every video id,
    and queues each watch URL with the downloader.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Literal text present on a listing page whenever a further page exists.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Pull the channel id out of the URL.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        page_num = 1

        # Fetch listing pages until the "more pages" marker disappears.
        while True:
            self.report_download_page(channel_id, page_num)
            page_url = self._TEMPLATE_URL % (channel_id, page_num)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids in order of first appearance, de-duplicated per page.
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_num = page_num + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Queue each collected video for download.
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1812
1813
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through a user's uploads feed (GData API, 50 ids per request),
    applies playliststart/playlistend, and hands the resulting watch URLs
    to the downloader's extract_info_iterable.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # maximum result count per GData request
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based in params; convert to 0-based slice start.
        # playlistend is 1-based inclusive, matching an exclusive slice end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        return self._downloader.extract_info_iterable(urls)
1895
1896
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user page to a numeric users_id, then pages through the
    mobile episode-list endpoint (12 items per page) and queues every
    video URL with the downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12  # items per Ajax page
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # BUGFIX: guard against a non-matching pattern -- previously
        # mobj.group(1) raised an uncaught AttributeError here, since the
        # except clause above only catches network errors.
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id from webpage')
            return
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Use compat_str for consistency with the rest of the file
                # (previously this one call used plain str()).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based in params; convert to 0-based slice start.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1986
1987
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct fileshare URL and title for a depositfiles link."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button press).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction banner.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): .decode('utf-8') here and below assumes Python 2
        # byte strings (webpage is read as bytes); str has no decode() on
        # Python 3 -- confirm before running under py3.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2046
2047
2048 class FacebookIE(InfoExtractor):
2049     """Information Extractor for Facebook"""
2050
2051     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2052     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2053     _NETRC_MACHINE = 'facebook'
2054     IE_NAME = u'facebook'
2055
    def report_login(self):
        """Report attempt to log in to Facebook (progress message only)."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2059
    def _real_initialize(self):
        """Log in to Facebook before extraction, if credentials are available.

        Credential sources, in priority order: explicit --username/--password
        downloader params, then .netrc (machine 'facebook'). If neither is
        present, extraction proceeds anonymously. Login failures are reported
        as warnings, not fatal errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials found anywhere: skip login entirely.
        if useremail is None:
            return

        # Log in by POSTing the mobile login form.
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form reappearing in the response means login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return
2103
2104     def _real_extract(self, url):
2105         mobj = re.match(self._VALID_URL, url)
2106         if mobj is None:
2107             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2108             return
2109         video_id = mobj.group('ID')
2110
2111         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2112         webpage = self._download_webpage(url, video_id)
2113
2114         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2115         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2116         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2117         if not m:
2118             raise ExtractorError(u'Cannot parse data')
2119         data = dict(json.loads(m.group(1)))
2120         params_raw = compat_urllib_parse.unquote(data['params'])
2121         params = json.loads(params_raw)
2122         video_url = params['hd_src']
2123         if not video_url:
2124             video_url = params['sd_src']
2125         if not video_url:
2126             raise ExtractorError(u'Cannot find video URL')
2127         video_duration = int(params['video_duration'])
2128
2129         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2130         if not m:
2131             raise ExtractorError(u'Cannot find title in webpage')
2132         video_title = unescapeHTML(m.group(1))
2133
2134         info = {
2135             'id': video_id,
2136             'title': video_title,
2137             'url': video_url,
2138             'ext': 'mp4',
2139             'duration': video_duration,
2140             'thumbnail': params['thumbnail_src'],
2141         }
2142         return [info]
2143
2144
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL serves the media file directly."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        /play/ URLs are first resolved to their canonical
        http://blip.tv/a/a-<id> form and re-extracted.  Otherwise the
        page's JSON API (skin=json&version=2&no_wrap=1) is queried; if the
        server instead answers with the media itself (Content-Type
        video/*), a direct-download entry carrying the open urlhandle is
        returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Follow the /play/ redirect; the file id is carried in the
            # 'file' parameter of the redirect URL's fragment.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv returns a simpler payload when queried with the iTunes UA.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str has no .decode on Python 3, so this
                # direct-download branch likely only works on Python 2 —
                # confirm before relying on it.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The API either wraps the record in a 'Post' object or
                # returns it bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2245
2246
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was self._download.trouble — no such attribute
            # exists, so reporting an invalid URL raised AttributeError.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media base URL; the flv lives next
        # to it under the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2295
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest quality last in _print_formats' source list.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> display resolution (informational only)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a verbose pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrates with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per show part.

        Flow: resolve shortname/newest-episode URLs, locate the mtvnservices
        media URI in the page, download the MRSS index, then for each item
        fetch its configuration XML, pick a bitrate, and rewrite the RTMP
        URL into a plain HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # ':tds' / ':colbert' style abbreviations map to the show's
        # full-episodes page; re-match so the later group lookups work.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode group means "newest episode": follow the
            # site's redirect to find out which one that is.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The redirect target names the concrete episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like '...:<show>.com:...:<shortId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp_url) pairs from the renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL into a direct HTTP download URL on
            # the llnwd.net CDN; the path after 'gsp.comedystor/' is reused.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2490
2491
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video by reading the page's og: meta tags and the
        player configuration referenced by the og:video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset from the Content-Type header,
            # defaulting to UTF-8 when none is declared.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): the .group(1) calls below raise AttributeError if a
        # meta tag is missing — there is no graceful error path here.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its 'config=' query.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2565
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked non-working upstream; kept for reference.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video via the moogaloop metadata XML and the Adobe
        HDS (f4m) manifest it points at."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally from the metadata and manifest below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore parameter is required for the HDS manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest is in the Adobe f4m namespace.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first-fragment URL from the manifest location plus the
        # media node's relative path.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2636
2637
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is URL-encoded inside a 'flv_url' query parameter.
        match = re.search(r'flv_url=(.+?)&', page)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', page)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # The full matched URL (group 0) is the thumbnail address.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', page)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2695
2696
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track URL is being resolved to an API id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report retrieval of the stream definitions."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a soundcloud.com track URL via the resolve API, then
        fetch its stream definitions and return the 128kbps MP3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve.json maps the public track URL to the API track record.
        # The client_id is a fixed application key.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint lists the per-format media URLs.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2769
2770
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The page embeds only a base64-encoded reference to the real media
        # id; decode it to build the RTMP url.
        m = re.search(r"jsclassref='([^']*)'", webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title (mandatory)
        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Description (optional)
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # Derive id and extension from the last path component of the url.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2824
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested format, selecting the
        highest bitrate when 'best' (or an unknown bitrate) is requested.
        Formats without per-bitrate sub-dicts are returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print all available format/bitrate combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a cloudcast via the mixcloud JSON API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # BUG FIX: regex groups are already text strings; the previous
        # .decode('utf-8') calls raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # BUG FIX: initialize so an empty 'audio_formats' dict cannot
        # trigger a NameError in the result dict below.
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # BUG FIX: all values here are already str on Python 3; the old
        # trailing .decode('utf-8') calls crashed.
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2939
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Named groups 'course' and 'video' determine which of the three
    # branches in _real_extract runs (single video / course / site root).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, a whole course, or every course.

        Course and root pages are handled by collecting links and
        recursively re-invoking self.extract() on each of them.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Video metadata lives in an XML file next to the media files.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last '.' of the media url.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # Fall back to the course id when no heading is found.
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect every video page linked from the course page
            # (orderedSet keeps first-seen order while dropping duplicates).
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each entry hits the single-video branch above.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect every course page and recurse into each of them.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3051
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract an MTV.com video via its mediaGen XML endpoint."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # BUG FIX: _download_webpage returns an already-decoded text string,
        # so the previous .decode('iso-8859-1') calls raised AttributeError
        # on Python 3; they have been removed.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUG FIX: message previously read 'unable to mtvn_uri'.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3131
3132
class YoukuIE(InfoExtractor):
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-unique session id: ms timestamp + two random parts."""
        return "%d%d%d" % (int(time.time() * 1000),
                           random.randint(1000, 1998),
                           random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the id alphabet with Youku's seeded PRNG; return the list."""
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        state = float(seed)
        shuffled = []
        while alphabet:
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(alphabet)))
            shuffled.append(alphabet.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Extract all segments of a Youku video as separate entries."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # Map the requested quality onto one of the site's formats.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [s['k'] for s in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the file id encode the segment number, so each
        # segment swaps its own index (as %02X) into that slot.
        files_info = []
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3242
3243
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull url, title and thumbnail out of the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch the page; all metadata is embedded in it.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3306
3307
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video from a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUG FIX: bail out instead of falling through to
            # mobj.group(1) on None (AttributeError).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes on the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUG FIX: bail out; links[-1] below would raise IndexError.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3431
class NBAIE(InfoExtractor):
    """Information extractor for videos on nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Extract the direct MP4 url and metadata for an NBA video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN url is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First regex group from the page, HTML-unescaped, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date'; the documented
            # result field (see module docstring) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3467
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the Justin.tv JSON API.

        Returns a tuple (total_item_count, info_list).  Entries without a
        'video_file_url' are filtered out of info_list, so the count may be
        larger than len(info_list).  On download or API error, reports the
        problem and returns (0, []) so callers can still unpack the result.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUGFIX: previously returned None here, which made the caller's
            # tuple unpacking in _real_extract raise a TypeError.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a dict instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            # BUGFIX: see above — keep the (count, list) contract on errors.
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' is ISO-like 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 only => channel URL (paged archive listing);
        # group 2 present => a single broadcast.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3554
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUGFIX: trouble() does not stop execution, so the code used to
            # fall through and crash with AttributeError on m.group(); raise
            # ExtractorError instead, as other extractors in this file do.
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            # BUGFIX: same fall-through crash as above.
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        # The description is optional; keep None when absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3591
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains comments/whitespace, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # The page lists movie entries, titles and thumbnails in parallel
        # order, so iterate the three match streams in lockstep.
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # BUGFIX: previously fell through and appended an entry with
                # an empty url; report the problem and skip this video.
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                continue
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
                  }
            videos.append(info)
        return videos
3632
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Recorded media are served directly from the CDN by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = title_match.group('title')
        uploader_match = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage)
        uploader = uploader_match.group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3654
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON blob in an inline script tag.
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        # Request the 256 kbit/s stream from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3689
3690
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; warn and continue if missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional; warn and continue if missing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component 4 looks like '480p_370k_8004515': size and bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: previously tested `result is None`, where `result` was a
            # stale regex match from above (never None here), so a missing
            # requested format was never reported.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3807
3808
3809
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is taken from the URL itself, not the page.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: the message wrongly said "unable to extract video title"
            # although this branch handles the upload date.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3851
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The watch page carries the title and a link to the embed page.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        # The actual flv URL is set up in the embed page's player script.
        webpage = self._download_webpage(embed_page_url, video_id)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3897
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS assignment.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random play-session id is required by the streaming API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        # Walk the mix track by track until the API flags the last one.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3941
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(
            r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)

        return [{
            'id': video_id,
            # Media and thumbnail are addressed on the CDN by video id.
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': unescapeHTML(uploader_match.group('uploader')),
        }]
3965
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on the URL type: a single talk yields one info dict,
        # a playlist yields a list of info dicts.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): `[.\s]*?` matches only literal dots and whitespace,
        # not arbitrary characters — presumably `(?:.|\s)*?` was intended;
        # left unchanged here, verify against real playlist markup.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Entries and their display names appear in the same order on the
        # page, so the two match streams are consumed in lockstep.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Each playlist entry is extracted like a standalone talk.
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # Pull the numeric id and media slug out of the inline talkDetails JS.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # The direct mp4 URL is derived from the media slug.
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4038
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos via their metadata XML API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: previously `format = ext`, but `ext` is undefined in
            # this scope (NameError); fall back to the file extension.
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional metadata fields.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4094
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE: more specific extractors (playlist/channel/user/search) must come
    # before the generic ones for the same site, and GenericIE stays last as
    # the catch-all fallback.
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        GenericIE()
    ]
4145
4146