Added a new option '--only-srt' to download only the subtitles of a video.
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) turns a URL into one or more dictionaries
    describing the video(s) the URL refers to (real media URL, title,
    uploader, ...). The FileDownloader then acts on those dictionaries,
    possibly downloading the media to the file system, among other outcomes.

    Mandatory fields in every dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses redefine _real_initialize() and _real_extract() and provide a
    _VALID_URL regular expression; they should normally also be added to the
    list of extractors. _real_extract() must return a *list* of information
    dictionaries as described above. Broken extractors should set _WORKING
    to False so users are warned and the tests are skipped.
    """

    _ready = False        # becomes True once _real_initialize() has run
    _downloader = None    # FileDownloader instance, set via set_downloader()
    _WORKING = True       # set to False on broken extractors

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Run _real_initialize() exactly once (authentication, etc.)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports to and downloads through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by dropping the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        message = u'Downloading video webpage' if note is None else note
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, message))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as exc:
            failure = u'Unable to download webpage' if errnote is None else errnote
            raise ExtractorError(u'%s: %s' % (failure, compat_str(exc)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose-mode regex: group(1) matches everything before the video ID
    # (so a bare ID is also accepted), group(2) is the video ID itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; unknown itags fall back to 'flv' at use sites
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string (appears to be height x width; used only
    # for display in format listings)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base implementation to pass re.VERBOSE, because
        # _VALID_URL is written in verbose style.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')


    def _extract_subtitles(self, video_id):
        """Download the closed captions for video_id as .srt text.

        Returns a (error_message, srt_contents) tuple where exactly one
        element is None.  Language preference: the 'subtitleslang' downloader
        parameter if set, else 'en', else the first listed track.
        """
        self.report_video_subtitles_download(video_id)
        # First fetch the list of available caption tracks.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a lang_code -> track name mapping from the XML track list.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language "%s"' % srt_lang, None)
        # Fetch the chosen track in srt format.
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
            'fmt': 'srt',
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, srt)

    def _print_formats(self, formats):
        """Print one 'itag : extension [dimensions]' line per format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age.

        Credentials come from the 'username'/'password' downloader params or
        from the 'youtube' .netrc machine entry; when neither is available,
        the login step is skipped.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so later regexes match)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX and dsh anti-forgery tokens from the form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in (field set mirrors what the ServiceLogin form posts;
        # u'霱' is the form's opaque _utf8 marker value)
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is served back, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID (group 2 of _VALID_URL), or None on bad URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract video information; returns one dict per requested format.

        Steps: resolve next_url redirects, download the watch page, query
        get_video_info, pull metadata (uploader, title, thumbnail, date,
        description, optional subtitles), then pick the format(s) to
        download according to the 'format'/'format_limit' params.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL (needed by rtmpdump downloads)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (non-fatal: warn and continue with None)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try each known date layout;
        # once one parse succeeds, later strptime calls fail and are ignored.
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:  # NOTE(review): bare except also hides non-parse errors
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions (requesting a language or --only-srt implies
        # writing subtitles)
        video_subtitles = None
        if self._downloader.params.get('subtitleslang', False):
            self._downloader.params['writesubtitles'] = True
        if self._downloader.params.get('onlysubtitles', False):
            self._downloader.params['writesubtitles'] = True
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig'
            # field; an entry without it would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
576
577
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Initialization POSTs the family-filter ("over 18") confirmation form so
    filtered videos are reachable.  Video ids prefixed with "yt-" are
    YouTube embeds and are delegated back to the downloader.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    # NOTE: the redundant __init__ that only forwarded to
    # InfoExtractor.__init__ was removed; the inherited constructor has the
    # same signature and behavior.

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then confirm age via the filter form."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age: filters=0 disables the family filter
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video info; returns a single-entry list of dicts."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube ("yt-" prefix marks embeds)
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information.
        # BUGFIX: decode the response bytes once, the way
        # InfoExtractor._download_webpage does.  The previous code matched
        # str patterns against the raw bytes of urlopen().read() (TypeError
        # on Python 3) and called .decode('utf-8') on the extracted groups
        # (AttributeError on Python 3, where str has no decode()).
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return
        webpage = webpage_bytes.decode('utf-8', 'replace')

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
703
704
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality video URL and metadata from a Dailymotion page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id is the path component before any '_<slug>' suffix or query string.
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Disable the family filter so age-restricted videos stay reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Probe available qualities from best to worst; keep the first hit.
        best_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                best_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if best_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        match = re.search(r'"' + best_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # The URL is percent-encoded and has JSON-escaped slashes.
        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        video_uploader = None
        match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if match is not None:
            video_uploader = match.group(1)
        else:
            # Fall back to the official-user markup.
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        video_upload_date = None
        match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if match is not None:
            # Page shows DD-MM-YYYY; reorder to the YYYYMMDD convention.
            video_upload_date = match.group(3) + match.group(2) + match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
792
793
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the direct .flv URL and metadata from a Photobucket page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Download the page that embeds the player.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The media URL travels in the 'file' parameter of the video_src link.
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader both come from the <title> element.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
857
858
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata from a Yahoo! Video page.

        Non-/watch/ URLs are rewritten to their canonical /watch/ form and
        re-extracted once (new_video=False guards the recursion).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) is the literal 'people'/'profile' path segment of
        # the regex alternation; the uploader name is the link text, group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1000
1001
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the media URL and metadata from a Vimeo video page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Canonicalize the URL: force a scheme, and turn player redirect
        # links into the plain video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player setup script.
        # BUGFIX: was a bare 'except:', which also swallowed KeyboardInterrupt
        # and SystemExit; only the failures the parsing can actually raise are
        # caught now (IndexError from the split, ValueError from json.loads).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available bucket (hd > sd > whatever else exists).
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1120
1121
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting trouble."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, search it with regex, and map capture groups to a dict.

        matchTuples is a list of (group_index, key, error_message); each named
        group must match or extraction is aborted with its error message.
        Returns the {key: value} dict, or None after reporting trouble.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): page is None when fetch_webpage failed; re.search would
        # then raise TypeError rather than report cleanly — confirm intended.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the rtmp stream.

        NOTE(review): the final video_url is assigned but never returned or
        used, so live streams currently yield no result — looks unfinished.
        """
        # Language code is the 4th-from-last path component of the live URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The JS file carries the stream path, the SWF player and the rtmp base.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two XML indirections to an info dict."""
        # Language code is the 3rd-from-last path component of the page URL.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # The videoref file lists one <video> entry per language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # The per-language file holds id, title, date and the hd-quality URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between the live-stream and Arte+7 extraction paths."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1256
1257
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Returns True when a redirect was found (and the new URL queued for
        download), False otherwise.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD requests so we only fetch headers, not bodies.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: scrape the page for an embedded media URL."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this branch previously reported 'unable to extract title',
            # copied from the block above; it actually fails on the uploader.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1405
1406
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo-URLs of the form ``ytsearch:QUERY`` (first result),
    ``ytsearchN:QUERY`` (first N results) and ``ytsearchall:QUERY``
    (up to _max_youtube_results results).  Each matching video is queued
    on the downloader as an ordinary watch-page URL.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch pseudo-URL and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the *first* colon only, so search terms that themselves
        # contain a colon (e.g. "ytsearch5:re: my video") don't raise a
        # "too many values to unpack" ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the 'ytsearch' marker: '' / 'all' / digits
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Cap the loop when the API reports fewer total matches than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1481
1482
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles pseudo-URLs ``gvsearch:QUERY`` (first result),
    ``gvsearchN:QUERY`` (first N results) and ``gvsearchall:QUERY``
    (up to _max_google_results results), queuing matches on the
    downloader as video.google.com playback URLs.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch pseudo-URL and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the *first* colon only, so search terms containing a
        # colon don't raise a "too many values to unpack" ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the 'gvsearch' marker: '' / 'all' / digits
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" marker means we have exhausted the results.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1563
1564
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles pseudo-URLs ``yvsearch:QUERY`` (first result),
    ``yvsearchN:QUERY`` (first N results) and ``yvsearchall:QUERY``
    (up to _max_yahoo_results results).  Marked non-working (_WORKING).
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch pseudo-URL and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the *first* colon only, so search terms containing a
        # colon don't raise a "too many values to unpack" ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the 'yvsearch' marker: '' / 'all' / digits
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" marker means we have exhausted the results.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1649
1650
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the paginated playlist pages, collects the video ids, applies
    the downloader's playliststart/playlistend window, and queues each
    video as a regular watch URL.
    """

    # group(1): playlist type marker ('p', 'a', 'list'); group(2): playlist id;
    # group(3): a single-video id when the URL points at one entry of a playlist.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the playlist (or the single linked video)."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # everything else (including 'list') is fetched via view_play_list
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated within each page)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the page no longer shows a "Next »" link.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the user-selected playlist window (1-based start, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1729
1730
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Pages through a channel's video listing and queues every video id
    found as an ordinary watch-page URL.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Announce the download of one channel listing page."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Collect all video ids of the channel and queue them for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather the ids on this page, dropping duplicates within the page
            # but keeping the order in which they first appear.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # Keep paging only while a "Next »" link is present.
            if self._MORE_PAGES_INDICATOR in page:
                pagenum = pagenum + 1
            else:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1781
1782
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Fetches a user's uploads through the GData API page by page, applies
    the downloader's playliststart/playlistend window, and queues each
    collected id as a regular watch-page URL.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Announce which slice of the user's uploads is being fetched."""
        message = u'[youtube] user %s: Downloading video ids from %d to %d' % \
                (username, start_index, start_index + self._GDATA_PAGE_SIZE)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Collect the user's video ids and queue the selected range."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 entries), so
        # keep requesting consecutive pages until one comes back short.
        video_ids = []
        page_index = 0

        while True:
            start_index = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, dropping duplicates within the
            # page but keeping first-seen order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one; no need to
            # query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_index += 1

        all_ids_count = len(video_ids)

        # Apply the user-selected window (1-based start, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        upper = None if playlistend == -1 else playlistend
        video_ids = video_ids[playliststart:upper]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1865
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user's numeric id from their profile page, then pages
    through the mobile episode-list endpoint and queues every video found.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Page size of the Ajax episode-list endpoint used below.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue all videos of a blip.tv user for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Template for the episode-list endpoint; users_id is filled in below
        # after scraping it from the profile page.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # data-users-id holds the numeric user id needed by the endpoint.
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (deduplicated within each page)
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the user-selected playlist window (1-based start, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1956
1957
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Requests the file page with the 'Free download' gateway flag set,
    then scrapes the real download URL and the file title out of the
    returned HTML.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title for a depositfiles link."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string: '\s' must reach the regex engine, not be
                # treated as an (invalid) string escape sequence.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2016
2017
2018 class FacebookIE(InfoExtractor):
2019     """Information Extractor for Facebook"""
2020
2021     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2022     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2023     _NETRC_MACHINE = 'facebook'
2024     IE_NAME = u'facebook'
2025
2026     def report_login(self):
2027         """Report attempt to log in."""
2028         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2029
    def _real_initialize(self):
        """Log in to Facebook before extraction, if credentials are available.

        Credentials come either from the downloader's 'username'/'password'
        params or from the user's .netrc file.  All failures are reported as
        warnings and initialization continues without a login session.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials at all: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the login did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2073
    def _real_extract(self, url):
        """Extract URL, title and metadata for a Facebook video.

        Returns a one-element list with the info dictionary, or None after
        reporting trouble when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Canonicalize to the video.php form before downloading the page.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player data is a JSON array of [name, value] pairs sandwiched
        # between these two exact JavaScript snippets in the page source.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is itself URL-encoded JSON holding the stream URLs.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream; fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2113
2114
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points straight at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information for a blip.tv URL.

        Two cases are handled: the URL may serve the video file directly
        (detected via the Content-Type response header), or it serves a
        page whose JSON metadata is fetched by appending skin=json to the
        query.  Returns a one-element list with the info dict, or None on
        error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the right query separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # Same User-Agent is recorded in the info dict below so the actual
        # download is made with it too.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): on Python 3 `title` is already str, so
                # .decode('UTF-8') would raise AttributeError — confirm this
                # path only runs under the Python 2 compat layer.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # Hand the already-open handle to the downloader so the
                # response body is not fetched twice.
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert e.g. '12-31-12 11:30PM' into 'YYYYMMDD'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2204
2205
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video information from a myvideo.de watch URL.

        Returns a one-element list with the info dictionary, or None after
        reporting trouble when the URL or page cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bugfix: this called self._download.trouble (no such attribute),
            # raising AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link embeds the media base URL; the .flv lives
        # directly under it.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2254
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate ids, lowest to highest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate id (all mp4).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame size per bitrate id, shown by --list-formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show index (MRSS feed)."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print each format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of an episode or clip.

        Handles :tds/:colbert shortcut forms, full-episode URLs (following
        the site redirect to the newest episode) and individual clip URLs.
        Returns a list of info dicts, or None after reporting trouble.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand shortcut forms like :tds to the full-episodes page URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The site redirected to the newest episode; re-parse the final
            # URL to recover the concrete episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS index feed is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the configuration XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into its progressive-HTTP equivalent.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2448
2449
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report that the player configuration is being downloaded."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video information for an Escapist Magazine video page.

        Fetches the page, follows the player's config URL and reads the
        media URL from the JSON-like playlist it contains.  Returns a
        one-element list with the info dict, or None on error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset declared in Content-Type, falling
            # back to UTF-8 when none is declared.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): the four searches below are not checked for None —
        # a page missing one of these <meta> tags raises AttributeError
        # on .group() instead of reporting a clean error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes must be
        # rewritten before json.loads will accept it.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # The media entry is the second playlist item.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2523
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken: this extractor is skipped by default.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the f4m manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video information via the moogaloop XML and f4m manifest.

        Returns a one-element list with the info dict, or None on error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # First XML document: description, title, thumbnail and the URL of
        # the Adobe HDS (f4m) manifest.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # Second document (the manifest, in the f4m namespace): the media
        # node id and the document id, used to build the fragment URL.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Rebuild the fragment URL from the manifest location; only the
        # first fragment (Seg1-Frag1) is referenced here.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2594
2595
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Extract URL, title and thumbnail for an xvideos video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is url-encoded inside a flash variable.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The page <title> carries the video title before the site name.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # For the thumbnail the entire matched URL is used (group 0).
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2653
2654
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report resolution of the page URL to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract track information for a soundcloud.com URL.

        Resolves the human-readable URL to track metadata via the public
        API, then queries the streams endpoint for the MP3 URL.  Returns
        a one-element list with the info dict, or None on error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        # NOTE(review): simple_title is computed but never used below.
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve endpoint turns the page URL into track metadata,
        # including the numeric track id used by the streams endpoint.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint lists concrete media URLs per format.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2727
2728
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Extract the RTMP stream information for an InfoQ presentation."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64-encoded in a JS variable; decode it
        # and append it to the RTMP base to form the stream URL.
        match = re.search(r"jsclassref='([^']*)'", webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title comes from a JS assignment in the page.
        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Description is optional; keep a placeholder when it is absent.
        video_description = u'No description available.'
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if match is not None:
            video_description = match.group(1)

        # Derive id and extension from the last path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2782
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Picks the requested bitrate when present; 'best' (or an unknown
        bitrate) falls back to the highest available one. Formats
        without bitrate sub-dicts map directly to a url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue # dead link; try the next candidate

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: re.match on a text URL already yields text groups, so the
        # previous .decode('utf-8') calls raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON; urlopen returns bytes, decode explicitly for
        # compatibility with json.loads on Python < 3.6
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try every format until one yields a live URL
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param if format_param is not None else u'NA'),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2897
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Dispatches on the URL shape:
      * course= and video= present -> extract that single video,
      * only course= present       -> playlist of every video on the course page,
      * anything else (site root)  -> playlist of every course.
    The playlist branches recurse through self.extract() on 'reference' entries.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a list of info dicts for a video, a course, or the whole site."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata is published as an XML file alongside the videos
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last '.' of the video URL
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Title from the page's <h1>; fall back to the course id
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every linked VideoPage becomes a playlist entry, deduplicated in order
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference is handled by the single-video branch
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every linked CoursePage becomes a playlist entry, deduplicated in order
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Each course reference is handled by the course branch above
                results += self.extract(entry['url'])
            return results
3009
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE: _download_webpage already returns decoded text, so the
        # previous .decode('iso-8859-1') calls on the matched groups
        # raised AttributeError on Python 3 and have been removed.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message previously read 'unable to mtvn_uri' (missing verb)
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML document listing renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3089
3090
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served as numbered segments; each segment gets its own
    info dict ('<id>_partNN'), so _real_extract returns a list.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: current time in milliseconds plus two random
        # components, concatenated as decimal digits.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character alphabet with the
        # server-provided seed, stepping seed = (seed*211 + 30031) % 65536
        # and drawing one remaining character per step. The shuffled list
        # is the lookup table used to decode obfuscated file ids.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the seed-mixed
        # alphabet; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Download the playlist JSON and build one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's format request onto youku's stream names:
            # 'best' prefers hd2 when available, 'worst' is mp4, anything
            # else falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3200
3201
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page in one step
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Video URL (url-quoted inside the flash config)
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title
        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail
        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3264
3265
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # bugfix: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # bugfix: previously fell through and crashed on links[-1]
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3389
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # Direct CDN URL derived from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First capture group of rexp in the page, HTML-unescaped,
            # or `default` when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # bugfix: key was misspelled 'uploader_date'; the documented
            # info-dict field is 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3425
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Fetch one API page and return (item_count, info_dicts).

        bugfix: error paths now return (0, []) instead of None, so the
        caller's tuple unpacking no longer raises TypeError on failure.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # The API reports errors as a dict with an 'error' key
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-ish: keep YYYYMMDD digits only
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: walk the archive API page by page
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed) page means we've reached the end
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3512
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Bail out here: calling m.group() on None would raise AttributeError.
            self._downloader.trouble(u'ERROR: unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            # Same as above: without a match there is no title to extract.
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # The description is optional; fall back to None when the meta tag is absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3549
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            # Bail out: m.group() on None would raise AttributeError.
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # The description doubles as the title below, so it is required
            # despite the WARNING wording of the message.
            self._downloader.trouble(u'WARNING: Cannot find description')
            return
        # Strip embedded <a> links before unescaping the tweet text.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3598         
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Matches the flash player config entries that carry the movie URLs.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        for vid, vtitle in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                # Skip this entry instead of emitting an info dict with no URL,
                # which would make the downloader fail later.
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
3634
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The CDN URL can be derived directly from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = title_match.group('title')
        uploader_match = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = uploader_match.group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3656
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON blob in an inline script.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if m is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # Derive the extension from the URL path (e.g. ".../show.mp3" -> "mp3").
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3691
3692
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age-verification cookie skips the interstitial page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal: warn and carry on without it)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal as well)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component carries "<size>_<bitrate>_<id>"; keep size+bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: previously this checked the stale `result` match object
            # (always non-None here), so an unavailable format returned [None].
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3809
3810         
3811
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Fetch the page once; both the video URL and the date come from it.
        webpage = self._download_webpage(url, video_id)

        # The flash player config embeds the direct .flv URL.
        result = re.search(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Upload date, e.g. 'Added 07/22/2009 by'.
        result = re.search(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        upload_date = result.group('date')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv'
        }]
3853
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the landing page first; it links to the embed page.
        webpage = self._download_webpage(url, video_id)

        # Title comes from the page's <title> element.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual player is hosted on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player config exposes the media URL.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url
        }]
3899
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON literal assigned to PAGE.mix.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if m is None:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # Each playback session uses a random id; the API hands out one track
        # at a time, so we walk the play/next endpoints until the last track.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        i = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            i += 1
        return res
3943
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs can be derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3967
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Talk entries and their titles are matched by two separate regexes
        # and zipped together below, relying on them appearing in page order.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_dic={
                       'id': m_video.group('video_id'),
                       'url': self._talk_video_link(m_video.group('mediaSlug')),
                       'ext': 'mp4',
                       'title': m_name.group('fullname')
                       }
            info.append(video_dic)
        return info
    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails script block carries the numeric id and the media slug
        # needed to build the direct download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title
                }
        return info
4039
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext`
            # (NameError); fall back to the file extension, matching the
            # documented default for the 'format' field.
            format = extension
        else:
            format = format_id_el.text
        # Description and preview image are optional.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4095
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this sequence ordered from most to least specific matcher;
    # GenericIE must stay last as the catch-all.
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        GenericIE,
    ]
    return [ie_class() for ie_class in ie_classes]
4147
4148