Spiegel: Support hash at end of URL
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor turns a URL into a list of dictionaries
    describing the video(s) behind it; the FileDownloader then consumes
    those dictionaries, possibly downloading the media to disk.

    Mandatory fields in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses re-define _real_initialize() and _real_extract() and supply
    a _VALID_URL regular expression; they should also be added to the list
    of extractors. _real_extract() must return a *list* of information
    dictionaries as described above. Broken extractors set _WORKING to
    False so users are warned and the tests are skipped.
    """

    # Class-level defaults; __init__ re-assigns _ready and _downloader
    # per instance.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor and attach an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True when *url* matches this extractor's _VALID_URL."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Return whether this extractor is currently functional."""
        return cls._WORKING

    def initialize(self):
        """Perform one-time setup (authentication, etc) on first use."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach *downloader* as the FileDownloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the display name by stripping the trailing "IE" from
        # the concrete class name (e.g. YoutubeIE -> "Youtube").
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open *url_or_request* and return the response handle.

        Announces the download via the downloader; network failures are
        re-raised as ExtractorError carrying the original traceback.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download the page and return its body decoded to a string.

        The charset is taken from the Content-Type header when present,
        falling back to UTF-8; undecodable bytes are replaced.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        return urlh.read().decode(encoding, 'replace')
138
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles language setup, optional login and age confirmation in
    _real_initialize(), then extracts one result dict per selected
    format in _real_extract().
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/file extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string shown by _print_formats and in the
    # 'format' field of the result dicts
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Plain attribute deliberately overrides the IE_NAME property
    # defined on InfoExtractor.
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so defer them to
        # YoutubePlaylistIE (defined elsewhere in this file).
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without a duration default to 4 seconds
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download subtitles for *video_id*.

        Returns an ``(error, srt)`` pair where exactly one element is
        None: on success ``(None, srt_text)``, otherwise a warning
        message and None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map lang_code -> track name for all available caption tracks
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user-requested, then English, then first available
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print one "itag : extension [dimensions]" line per format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX and dsh form tokens from the login page
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): earlier failures in this method use
            # report_warning(); this one aborts via trouble() — confirm the
            # inconsistency is intentional.
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID (second capture group of _VALID_URL)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract metadata and one result dict per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): the bare except is used to try the next
                    # date format, but it also swallows any other error;
                    # on total failure upload_date keeps the raw string.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): entries are filtered for 'itag' and 'url' but not
            # 'sig' — a stream without a signature would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
597
598
599 class MetacafeIE(InfoExtractor):
600     """Information Extractor for metacafe.com."""
601
602     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
603     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
604     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
605     IE_NAME = u'metacafe'
606
607     def __init__(self, downloader=None):
608         InfoExtractor.__init__(self, downloader)
609
610     def report_disclaimer(self):
611         """Report disclaimer retrieval."""
612         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
613
614     def report_age_confirmation(self):
615         """Report attempt to confirm age."""
616         self._downloader.to_screen(u'[metacafe] Confirming age')
617
618     def report_download_webpage(self, video_id):
619         """Report webpage download."""
620         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
621
622     def report_extraction(self, video_id):
623         """Report information extraction."""
624         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
625
626     def _real_initialize(self):
627         # Retrieve disclaimer
628         request = compat_urllib_request.Request(self._DISCLAIMER)
629         try:
630             self.report_disclaimer()
631             disclaimer = compat_urllib_request.urlopen(request).read()
632         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
633             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
634             return
635
636         # Confirm age
637         disclaimer_form = {
638             'filters': '0',
639             'submit': "Continue - I'm over 18",
640             }
641         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
642         try:
643             self.report_age_confirmation()
644             disclaimer = compat_urllib_request.urlopen(request).read()
645         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
646             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
647             return
648
649     def _real_extract(self, url):
650         # Extract id and simplified title from URL
651         mobj = re.match(self._VALID_URL, url)
652         if mobj is None:
653             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
654             return
655
656         video_id = mobj.group(1)
657
658         # Check if video comes from YouTube
659         mobj2 = re.match(r'^yt-(.*)$', video_id)
660         if mobj2 is not None:
661             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
662             return
663
664         # Retrieve video webpage to extract further information
665         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
666         try:
667             self.report_download_webpage(video_id)
668             webpage = compat_urllib_request.urlopen(request).read()
669         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
670             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
671             return
672
673         # Extract URL, uploader and title from webpage
674         self.report_extraction(video_id)
675         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
676         if mobj is not None:
677             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
678             video_extension = mediaURL[-3:]
679
680             # Extract gdaKey if available
681             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
682             if mobj is None:
683                 video_url = mediaURL
684             else:
685                 gdaKey = mobj.group(1)
686                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
687         else:
688             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
689             if mobj is None:
690                 self._downloader.trouble(u'ERROR: unable to extract media URL')
691                 return
692             vardict = compat_parse_qs(mobj.group(1))
693             if 'mediaData' not in vardict:
694                 self._downloader.trouble(u'ERROR: unable to extract media URL')
695                 return
696             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
697             if mobj is None:
698                 self._downloader.trouble(u'ERROR: unable to extract media URL')
699                 return
700             mediaURL = mobj.group(1).replace('\\/', '/')
701             video_extension = mediaURL[-3:]
702             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
703
704         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
705         if mobj is None:
706             self._downloader.trouble(u'ERROR: unable to extract title')
707             return
708         video_title = mobj.group(1).decode('utf-8')
709
710         mobj = re.search(r'submitter=(.*?);', webpage)
711         if mobj is None:
712             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
713             return
714         video_uploader = mobj.group(1)
715
716         return [{
717             'id':       video_id.decode('utf-8'),
718             'url':      video_url.decode('utf-8'),
719             'uploader': video_uploader.decode('utf-8'),
720             'upload_date':  None,
721             'title':    video_title,
722             'ext':      video_extension.decode('utf-8'),
723         }]
724
725
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Video id is the path segment after /video/; trailing title/query parts
    # are stripped in _real_extract.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # NOTE(review): presumably marks this extractor as known-broken so it is
    # skipped by default — confirm how _WORKING is consumed elsewhere.
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video URL, title, uploader and upload date from a
        Dailymotion page; returns a one-element list of info dicts or
        None (after reporting) on failure."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The captured group may carry a _title suffix and/or query string;
        # keep only the leading id part.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information.
        # The cookie disables the family filter so age-gated pages load.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality: the list is ordered best-first,
        # and the for/else reports an error when no key matches.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # JSON escapes slashes; undo before using the URL.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader: try the regular owner span first, then fall back to the
        # "official user" markup; missing uploader is only a warning.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date appears as DD-MM-YYYY on the page; reorder to YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
813
814
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Announce that the video webpage is being downloaded."""
        message = u'[photobucket] %s: Downloading webpage' % video_id
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Announce that information extraction has started."""
        message = u'[photobucket] %s: Extracting information' % video_id
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a Photobucket page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Fetch the page the URL points at.
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The media URL sits in the video_src <link> tag's file= parameter.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader come from the same <title> pattern.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
878
879
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video URL.

        Non-/watch/ URLs are first resolved to the canonical /watch/ form
        (recursing once with new_video=False).  Returns a one-element
        list of info dicts, or None (after reporting) on failure.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group 1 only captures the 'people'/'profile' alternation;
        # the uploader name is in group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1021
1022
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo page via its embedded config JSON.

        Returns a one-element list of info dicts, or None (after
        reporting) on failure.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize scheme-less and direct-link URLs to a canonical page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (split can raise IndexError, json.loads
        # can raise ValueError).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed SystemExit
            # and KeyboardInterrupt; `Exception` keeps the same handling for
            # actual parse failures.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYY-MM-DD in the page -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality bucket that has at least one candidate;
        # for/else reports when nothing usable was found.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1141
1142
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<n>.html; matched against the last
    # URL path segment in _real_extract.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None after
        reporting the failure via the downloader."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* (with *regexFlags*) and collect the
        groups listed in *matchTuples* as (group_index, key, error_msg)
        into a dict.  Returns None (after reporting) if the page does not
        match or a required group is empty."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the rtmp URL.

        NOTE(review): the final video_url is computed but neither returned
        nor stored, so callers get None — live streams are effectively
        unsupported by this path.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page through its two XML indirections and
        return the info dict for the hd-quality stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or arte+7 extractor.

        Live URLs currently return None (see extractLiveStream)."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1277
1278
1279 class GenericIE(InfoExtractor):
1280     """Generic last-resort information extractor."""
1281
1282     _VALID_URL = r'.*'
1283     IE_NAME = u'generic'
1284
1285     def __init__(self, downloader=None):
1286         InfoExtractor.__init__(self, downloader)
1287
1288     def report_download_webpage(self, video_id):
1289         """Report webpage download."""
1290         if not self._downloader.params.get('test', False):
1291             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1292         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1293
1294     def report_extraction(self, video_id):
1295         """Report information extraction."""
1296         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1297
1298     def report_following_redirect(self, new_url):
1299         """Report information extraction."""
1300         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1301
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request for *url*; if the final URL differs, hands the
        new URL back to the downloader and returns True, otherwise False.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force the HEAD method so only headers are fetched.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Re-issue as HEAD, dropping body-related headers.
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1356
1357     def _real_extract(self, url):
1358         if self._test_redirect(url): return
1359
1360         video_id = url.split('/')[-1]
1361         try:
1362             webpage = self._download_webpage(url, video_id)
1363         except ValueError as err:
1364             # since this is the last-resort InfoExtractor, if
1365             # this error is thrown, it'll be thrown here
1366             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1367             return
1368
1369         self.report_extraction(video_id)
1370         # Start with something easy: JW Player in SWFObject
1371         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1372         if mobj is None:
1373             # Broaden the search a little bit
1374             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1375         if mobj is None:
1376             # Broaden the search a little bit: JWPlayer JS loader
1377             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1378         if mobj is None:
1379             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380             return
1381
1382         # It's possible that one of the regexes
1383         # matched, but returned an empty group:
1384         if mobj.group(1) is None:
1385             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1386             return
1387
1388         video_url = compat_urllib_parse.unquote(mobj.group(1))
1389         video_id = os.path.basename(video_url)
1390
1391         # here's a fun little line of code for you:
1392         video_extension = os.path.splitext(video_id)[1][1:]
1393         video_id = os.path.splitext(video_id)[0]
1394
1395         # it's tempting to parse this further, but you would
1396         # have to take into account all the variations like
1397         #   Video Title - Site Name
1398         #   Site Name | Video Title
1399         #   Video Title - Tagline | Site Name
1400         # and so on and so forth; it's just not practical
1401         mobj = re.search(r'<title>(.*)</title>', webpage)
1402         if mobj is None:
1403             self._downloader.trouble(u'ERROR: unable to extract title')
1404             return
1405         video_title = mobj.group(1)
1406
1407         # video uploader is domain name
1408         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1409         if mobj is None:
1410             self._downloader.trouble(u'ERROR: unable to extract title')
1411             return
1412         video_uploader = mobj.group(1)
1413
1414         return [{
1415             'id':       video_id,
1416             'url':      video_url,
1417             'uploader': video_uploader,
1418             'upload_date':  None,
1419             'title':    video_title,
1420             'ext':      video_extension,
1421         }]
1422
1423
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # API endpoint: (quoted query, 1-based start index); 50 results per page.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse an 'ytsearch<n>:terms' pseudo-URL and download the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms may themselves
        # contain colons (plain split() raised ValueError on such queries).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            # Keep the try narrow: only int() may legitimately raise here.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never request past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1502
1503
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearch<n>:terms' pseudo-URL and download the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms may themselves
        # contain colons (plain split() raised ValueError on such queries).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Keep the try narrow: only int() may legitimately raise here.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            # No "next page" marker means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1584
1585
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearch<n>:terms' pseudo-URL and download the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms may themselves
        # contain colons (plain split() raised ValueError on such queries).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Keep the try narrow: only int() may legitimately raise here.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            # No "Next" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1670
1671
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData API endpoint: (playlist id, page size, 1-based start index).
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    # Maximum number of entries requested per API page.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so match with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch all playlist entries via the GData API and queue each video URL."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API
        # Group 1 matches ids embedded in a known URL form; group 2 matches
        # a bare PL/EC/UU id given on its own.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so entries can be re-ordered below.
            # Entries lacking 'content' are skipped (presumably unavailable
            # videos - TODO confirm against the API).
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means the playlist is exhausted.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Apply the playliststart/playlistend bounds from the downloader
        # params (1-based start; playlistend == -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1764
1765
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk the channel's paginated video listing and queue every video id found."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        page_num = 1

        # Keep fetching pages until the "next page" marker disappears.
        while True:
            self.report_download_page(channel_id, page_num)
            page_url = self._TEMPLATE_URL % (channel_id, page_num)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, dropping within-page duplicates
            # while preserving first-seen order.
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_num += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1816
1817
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Page through the user's uploads feed and queue every video id found."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request successive windows until one comes back short - that page
        # must be the last one.
        video_ids = []
        page_index = 0

        while True:
            start_index = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's ids, dropping within-page duplicates while
            # preserving first-seen order.
            page_ids = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # Short page => nothing left to fetch.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_index += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # playlistend == -1 means "to the end"; map it onto an open slice.
        slice_end = None if playlistend == -1 else playlistend
        video_ids = video_ids[playliststart:slice_end]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1899
1900
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve the user's numeric id, then page through their episode list."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is embedded in the profile page; without it the
        # episode-list endpoint cannot be built. Previously a missing match
        # raised an uncaught AttributeError; report it explicitly instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the other extractors.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Compare the unescaped form so the
            # duplicate check matches what is actually stored in the list
            # (previously the raw id was compared against unescaped entries).
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1990
1991
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Fetch the file page with the free-download flag set and extract the direct URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string so '\s' is a regex class, not an (invalid,
                # py3-deprecated) string escape; collapse whitespace runs.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2050
2051
2052 class FacebookIE(InfoExtractor):
2053     """Information Extractor for Facebook"""
2054
2055     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2056     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2057     _NETRC_MACHINE = 'facebook'
2058     IE_NAME = u'facebook'
2059
    def report_login(self):
        """Report attempt to log in (to Facebook) on the downloader's screen."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2063
    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from the downloader params ('username'/'password')
        or, when 'usenetrc' is set, from the _NETRC_MACHINE entry in .netrc.
        Without credentials this is a silent no-op.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials from either source: proceed without logging in.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains the login form, the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return
2107
2108     def _real_extract(self, url):
2109         mobj = re.match(self._VALID_URL, url)
2110         if mobj is None:
2111             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2112             return
2113         video_id = mobj.group('ID')
2114
2115         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2116         webpage = self._download_webpage(url, video_id)
2117
2118         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2119         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2120         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2121         if not m:
2122             raise ExtractorError(u'Cannot parse data')
2123         data = dict(json.loads(m.group(1)))
2124         params_raw = compat_urllib_parse.unquote(data['params'])
2125         params = json.loads(params_raw)
2126         video_url = params['hd_src']
2127         if not video_url:
2128             video_url = params['sd_src']
2129         if not video_url:
2130             raise ExtractorError(u'Cannot find video URL')
2131         video_duration = int(params['video_duration'])
2132
2133         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2134         if not m:
2135             raise ExtractorError(u'Cannot find title in webpage')
2136         video_title = unescapeHTML(m.group(1))
2137
2138         info = {
2139             'id': video_id,
2140             'title': video_title,
2141             'url': video_url,
2142             'ext': 'mp4',
2143             'duration': video_duration,
2144             'thumbnail': params['thumbnail_src'],
2145         }
2146         return [info]
2147
2148
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles three URL shapes: /play/ player URLs (resolved via redirect
    and re-dispatched), direct media URLs (detected by Content-Type),
    and regular page URLs (queried through the skin=json API).
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Return a single-element list with the video info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player whose URL fragment carries a
        # 'file' parameter; its last path segment is the real file id.
        # Rewrite to the canonical form and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Append the skin=json API parameters with the right separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv varies its response by User-Agent; mimic iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode only exists on Python 2 byte
                # strings; presumably this path predates the py3 port.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the open handle to the downloader so the body
                    # is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened above and is reused here to read the JSON body.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Responses may be wrapped in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp is e.g. '12-31-12 11:59PM'; normalize to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2249
2250
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV media URL and title from a myvideo.de watch page.

        Returns a single-element list with the info dict, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble — the attribute is named
            # _downloader; the typo raised AttributeError on invalid URLs.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link embeds the media base URL; the video itself
        # lives at <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2299
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates known to the mediagen service, lowest to highest; the
    # highest available one is picked by default in _real_extract.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension (used for --get-format display)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> display resolution (informational only)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base class'
        # plain re.match would not interpret it correctly.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's RSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate/extension/resolution table (--list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve the show/episode behind *url* and return one info dict per part.

        Handles :shortname abbreviations (redirected to the newest full
        episode), clip URLs, and full-episode URLs. Each episode part is
        located through the mrss index feed and its mediagen config XML,
        and the RTMP stream URL is rewritten to a plain HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations to the show's
        # full-episodes landing page and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "download the newest episode":
            # the landing page redirects to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to the concrete episode URL and re-match
            # to pick up the real episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The player embed carries an mgid-style URI identifying the episode.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the RSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like mgid:...:<show>.com:<media-id>
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL to the equivalent plain-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2494
2495
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video info dict from an Escapist episode page.

        The media URL is found by reading the og:video player URL, then
        fetching the JS configuration it references.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header,
            # defaulting to UTF-8 when none is given.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries a percent-encoded config= parameter
        # pointing at the JSON(-ish) playlist configuration.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # NOTE(review): the second playlist entry is taken as the media;
        # presumably entry 0 is an ad or intro — verify against the feed.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2569
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Resolves the video through the moogaloop metadata XML and then an
    Adobe HDS (f4m) manifest; the final URL points at the first media
    fragment (Seg1-Frag1).
    """

    # Marked broken; kept for reference until the site format is re-checked.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a single-element list with the video info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Info dict is filled incrementally as each stage succeeds.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore parameter is required for the HDS manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest uses the Adobe namespace; note video_id is
        # deliberately rebound to the manifest's <id> element here.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL: /z<id-minus-last-2-chars>/<node>Seg1-Frag1
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2640
2641
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce that extraction has started for *video_id*."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the media URL, title and thumbnail out of an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is percent-encoded inside a flv_url= parameter.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the page <title>, trimmed before the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # The whole matched image URL (group 0) serves as the thumbnail.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2699
2700
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    # Captures <uploader>/<slug> from the track page URL.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the track via the SoundCloud API and return its info dict.

        Two API calls are made: resolve.json maps the page URL to the
        numeric track id, then the streams endpoint yields the MP3 URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The client_id below is a fixed public API key used by this extractor.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2773
2774
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP stream URL, title and description from an
        InfoQ presentation page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded, URL-quoted
        # stream path in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; keep a placeholder when absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the *last* dot only, so filenames that contain extra dots
        # (e.g. "12-jsarch-1.v2.flv") do not break the two-value unpacking.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2828
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry maps bitrates to url lists, pick the
        requested bitrate (or the highest for 'best'/unknown); when the
        entry is a plain url list, return it as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format (and bitrate, if known) to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve a cloudcast URL via the Mixcloud API and pick a
        working download URL for the requested format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url; re.match on a str already
        # yields str groups, so no .decode() is needed (str has no decode
        # method on Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON; decode the response bytes explicitly so this works on
        # every supported Python version
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2943
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # The path either names a specific video (course= and video= present),
    # a course page (only course=), or the site root.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, one course page
        (a playlist of videos), or the site root (a playlist of courses).
        Playlist branches recurse through self.extract on each entry."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Title and file name come from a per-video XML descriptor
            # living next to the video files.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # File extension taken from the URL's final suffix.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title from the page heading; fall back to the id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect unique VideoPage links and recurse into each one.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect unique CoursePage links and recurse into each course.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3055
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull song/performer metadata from the page, then fetch the
        mediaGen XML and pick the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns text, so the regex groups are already
        # str; calling .decode('iso-8859-1') on them crashes on Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # rendition type looks like "video/<ext>"; keep the suffix.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3135
3136
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two randoms."""
        millis = int(time.time() * 1000)
        rand_a = random.randint(1000, 1998)
        rand_b = random.randint(1000, 9999)
        return "%d%d%d" % (millis, rand_a, rand_b)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the id alphabet with Youku's linear-congruential PRNG,
        keyed by *seed*; returns the shuffled characters as a list."""
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        while alphabet:
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(alphabet)))
            # every character is unique, so popping by index matches the
            # original append-then-remove sequence exactly
            shuffled.append(alphabet.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated index list into characters of the
        seed-shuffled alphabet."""
        lookup = self._get_file_ID_mix_string(seed)
        return ''.join(lookup[int(part)] for part in fileId.split('*') if part)

    def _real_extract(self, url):
        """Fetch the playlist JSON, choose a format, and emit one info
        dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            if format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            elif format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # columns 8,9 of fileid carry the segment number, written as two
        # uppercase hex digits per segment
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3246
3247
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flash URL, title and thumbnail out of the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Flash URL is percent-encoded in the page source
        m = re.search(self.VIDEO_URL_RE, webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        m = re.search(self.VIDEO_TITLE_RE, webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        m = re.search(self.VIDEO_THUMB_RE, webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3310
3311
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Scrape the post page for metadata, follow the photo/video page
        link, and pick the highest-resolution stream URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Without this return the next line would crash on mobj.group
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Stop here instead of indexing into an empty list below
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3435
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Derive the CDN mp4 URL from the page path and scrape the rest
        of the metadata out of the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First regex group from the page, HTML-unescaped, or *default*.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # key was misspelled 'uploader_date'; the documented optional
            # field consumed downstream is 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3471
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return (count of items, list of *valid* items), or None on error
    def _parse_page(self, url):
        """Download one page of the JSON API and convert it to info dicts.

        Returns a tuple (raw_item_count, info_list); the raw count is needed
        by the caller to decide whether another page exists.  Returns None
        after reporting an error if the download or the API call failed.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a dict instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date (YYYY-MM-DD); strip the
                # dashes to get the YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL (no /b/<id> part): page through the whole archive.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            # BUG FIX: _parse_page returns None after reporting download/API
            # errors; the old code unpacked the result unconditionally and
            # crashed with "TypeError: 'NoneType' object is not iterable".
            page_result = self._parse_page(page_url)
            if page_result is None:
                break
            page_count, page_info = page_result
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3558
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUG FIX: the old code reported the error but fell through to
            # m.group('url'), crashing with AttributeError on None.
            self._downloader.trouble(u'ERROR: unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            # BUG FIX: same fall-through crash on a missing title.
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3595
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')
        page_url = 'http://store.steampowered.com/video/%s/' % game_id
        page = self._download_webpage(page_url, game_id)

        # Movie URLs live in a JS object; titles and thumbnails appear in the
        # same order elsewhere in the markup, so walk the three scans in step.
        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        results = []
        for movie, name, thumbnail in zip(re.finditer(movie_re, page),
                                          re.finditer(title_re, page),
                                          re.finditer(thumb_re, page)):
            clip_id = movie.group('videoID')
            clip_url = movie.group('videoURL')
            if not clip_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % clip_id)
            results.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumbnail.group('thumbnail'),
            })
        return results
3636
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title and uploader (channel id) come from data attributes on the page.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3658
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON blob in an inline script.
        meta_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not meta_match:
            raise ExtractorError(u'Cannot find metadata')
        try:
            data = json.loads(meta_match.group(1))
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        # Request the 256 kbps stream from Akamai.
        stream_url = data['akamai_url'] + '&cbr=256'
        extension = compat_urllib_parse_urlparse(stream_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': stream_url,
            'ext': extension,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3693
3694
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # Formats appear on the page ordered best-first.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: previously this tested the stale regex match object
            # 'result' (always non-None here), so an unavailable requested
            # format was never reported and [None] was returned.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3811
3812
3813
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id and title are both encoded in the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: this path used to claim it could not extract the
            # video *title*, which is misleading — it is the upload date
            # that failed to parse.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3855
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # Title comes straight from the <title> tag.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The real player lives on a separate embed page; find its URL
        # and the numeric id it carries.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv URL is passed to the flash player via addVariable().
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3901
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded on the page as a JS assignment.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id lets us walk the play/next API track by track.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
        return tracks
3945
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title comes from the OpenGraph meta tag, uploader from the bio box.
        title = unescapeHTML(
            re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage).group('title'))
        uploader = unescapeHTML(
            re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>',
                      webpage).group('uploader'))

        # Video and thumbnail URLs follow a fixed CDN scheme based on the id.
        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
3969
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched:
        # a single talk or a whole playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Talk ids/slugs and talk page links appear in the same order in the
        # markup, so the two scans below are walked in lockstep.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Each playlist entry is extracted like a standalone talk page.
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The inline talkDetails JS object carries the numeric id and the
        # media slug needed to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4042
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this fallback referenced the undefined name 'ext'
            # (NameError); the extension variable is called 'extension'.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4098
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    # BUG FIX: the trailing fragment group must be optional — as written,
    # '(?:#.*)$' REQUIRED a '#' at the end, so plain video URLs without a
    # fragment no longer matched. The intent ("support hash at end of URL")
    # is to accept both forms.
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # Format metadata lives in a per-video XML document; the last entry
        # is used below.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
4131
4132
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE: more specific extractors (playlists, channels, users, search)
    # must precede the plain YoutubeIE, and GenericIE must stay last since
    # it acts as the catch-all fallback.
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        GenericIE()
    ]
4184
4185