b7371365ac7e431808ca9927beb70c4e0b0e1635
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance.
    _ready = False
    # The FileDownloader used for screen output and error reporting.
    _downloader = None
    # Subclasses set this to False when the extractor is known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Runs _real_initialize() at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE" suffix.
        # Subclasses commonly shadow this with a plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): the traceback is passed as ExtractorError's second
            # argument; `sys` presumably arrives via `from .utils import *` —
            # confirm utils re-exports it.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour an explicit charset in the Content-Type header, else UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        # 'replace' keeps extraction going even on undecodable byte sequences.
        return webpage_bytes.decode(encoding, 'replace')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
159
160
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: matches full watch/embed/short URLs and also a naked
    # video ID. Group 1 is the (optional) URL prefix, group 2 the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Visiting this URL forces the English interface (see _real_initialize).
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension ('flv' is assumed for itags not listed here)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display dimensions (note: written height x width)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the InfoExtractor.IE_NAME property with a fixed name.
    IE_NAME = u'youtube'
220
221     @classmethod
222     def suitable(cls, url):
223         """Receives a URL and returns True if suitable for this IE."""
224         if YoutubePlaylistIE.suitable(url): return False
225         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
226
227     def report_lang(self):
228         """Report attempt to set language."""
229         self._downloader.to_screen(u'[youtube] Setting language')
230
231     def report_login(self):
232         """Report attempt to log in."""
233         self._downloader.to_screen(u'[youtube] Logging in')
234
235     def report_age_confirmation(self):
236         """Report attempt to confirm age."""
237         self._downloader.to_screen(u'[youtube] Confirming age')
238
239     def report_video_webpage_download(self, video_id):
240         """Report attempt to download video webpage."""
241         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
242
243     def report_video_info_webpage_download(self, video_id):
244         """Report attempt to download video info webpage."""
245         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
246
247     def report_video_subtitles_download(self, video_id):
248         """Report attempt to download video info webpage."""
249         self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
250
251     def report_video_subtitles_request(self, video_id, sub_lang, format):
252         """Report attempt to download video info webpage."""
253         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
254
255     def report_video_subtitles_available(self, video_id, sub_lang_list):
256         """Report available subtitles."""
257         sub_lang = ",".join(list(sub_lang_list.keys()))
258         self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
259
260     def report_information_extraction(self, video_id):
261         """Report attempt to extract video information."""
262         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
263
264     def report_unavailable_format(self, video_id, format):
265         """Report extracted video URL."""
266         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
267
268     def report_rtmp_download(self):
269         """Indicate the download will use the RTMP protocol."""
270         self._downloader.to_screen(u'[youtube] RTMP download detected')
271
272     def _get_available_subtitles(self, video_id):
273         self.report_video_subtitles_download(video_id)
274         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
275         try:
276             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
277         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
278             return (u'unable to download video subtitles: %s' % compat_str(err), None)
279         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
280         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
281         if not sub_lang_list:
282             return (u'video doesn\'t have subtitles', None)
283         return sub_lang_list
284
285     def _list_available_subtitles(self, video_id):
286         sub_lang_list = self._get_available_subtitles(video_id)
287         self.report_video_subtitles_available(video_id, sub_lang_list)
288
289     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
290         """
291         Return tuple:
292         (error_message, sub_lang, sub)
293         """
294         self.report_video_subtitles_request(video_id, sub_lang, format)
295         params = compat_urllib_parse.urlencode({
296             'lang': sub_lang,
297             'name': sub_name,
298             'v': video_id,
299             'fmt': format,
300         })
301         url = 'http://www.youtube.com/api/timedtext?' + params
302         try:
303             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
304         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
305             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
306         if not sub:
307             return (u'Did not fetch video subtitles', None, None)
308         return (None, sub_lang, sub)
309
310     def _extract_subtitle(self, video_id):
311         """
312         Return a list with a tuple:
313         [(error_message, sub_lang, sub)]
314         """
315         sub_lang_list = self._get_available_subtitles(video_id)
316         sub_format = self._downloader.params.get('subtitlesformat')
317         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
318             return [(sub_lang_list[0], None, None)]
319         if self._downloader.params.get('subtitleslang', False):
320             sub_lang = self._downloader.params.get('subtitleslang')
321         elif 'en' in sub_lang_list:
322             sub_lang = 'en'
323         else:
324             sub_lang = list(sub_lang_list.keys())[0]
325         if not sub_lang in sub_lang_list:
326             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
327
328         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
329         return [subtitle]
330
331     def _extract_all_subtitles(self, video_id):
332         sub_lang_list = self._get_available_subtitles(video_id)
333         sub_format = self._downloader.params.get('subtitlesformat')
334         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
335             return [(sub_lang_list[0], None, None)]
336         subtitles = []
337         for sub_lang in sub_lang_list:
338             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
339             subtitles.append(subtitle)
340         return subtitles
341
342     def _print_formats(self, formats):
343         print('Available formats:')
344         for x in formats:
345             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
346
    def _real_initialize(self):
        """Best-effort session setup: force the English interface, log in
        with credentials from params/.netrc when given, and confirm age.
        Failures are reported as warnings/errors and abort silently."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page to scrape the hidden GALX/dsh form tokens.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form reappearing in the response means auth failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return
450
451     def _extract_id(self, url):
452         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
453         if mobj is None:
454             self._downloader.report_error(u'invalid URL: %s' % url)
455             return
456         video_id = mobj.group(2)
457         return video_id
458
    def _real_extract(self, url):
        """Extract the metadata and download URL(s) for one YouTube video.

        Returns a list of info dictionaries (one per selected format), or
        None after reporting an error."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                # NOTE(review): bare except + no break — after a successful
                # parse the remaining strptime calls fail silently on the
                # already-reformatted YYYYMMDD string.
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        # --list-subs only lists languages and stops here.
        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        # NOTE(review): video_token is extracted but not used anywhere in the
        # rest of this method.
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): the filter above does not check 'sig'; a stream
            # entry without it raises KeyError here — confirm every entry
            # carries a signature.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected (format, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
662
663
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Watch URLs look like metacafe.com/watch/<segment>/<segment>/...
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter pages used to disable the adult-content gate.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
671
672     def __init__(self, downloader=None):
673         InfoExtractor.__init__(self, downloader)
674
675     def report_disclaimer(self):
676         """Report disclaimer retrieval."""
677         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
678
679     def report_age_confirmation(self):
680         """Report attempt to confirm age."""
681         self._downloader.to_screen(u'[metacafe] Confirming age')
682
683     def report_download_webpage(self, video_id):
684         """Report webpage download."""
685         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
686
687     def report_extraction(self, video_id):
688         """Report information extraction."""
689         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
690
691     def _real_initialize(self):
692         # Retrieve disclaimer
693         request = compat_urllib_request.Request(self._DISCLAIMER)
694         try:
695             self.report_disclaimer()
696             disclaimer = compat_urllib_request.urlopen(request).read()
697         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
698             self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
699             return
700
701         # Confirm age
702         disclaimer_form = {
703             'filters': '0',
704             'submit': "Continue - I'm over 18",
705             }
706         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
707         try:
708             self.report_age_confirmation()
709             disclaimer = compat_urllib_request.urlopen(request).read()
710         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
711             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
712             return
713
714     def _real_extract(self, url):
715         # Extract id and simplified title from URL
716         mobj = re.match(self._VALID_URL, url)
717         if mobj is None:
718             self._downloader.report_error(u'invalid URL: %s' % url)
719             return
720
721         video_id = mobj.group(1)
722
723         # Check if video comes from YouTube
724         mobj2 = re.match(r'^yt-(.*)$', video_id)
725         if mobj2 is not None:
726             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
727
728         # Retrieve video webpage to extract further information
729         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
730         try:
731             self.report_download_webpage(video_id)
732             webpage = compat_urllib_request.urlopen(request).read()
733         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
734             self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
735             return
736
737         # Extract URL, uploader and title from webpage
738         self.report_extraction(video_id)
739         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
740         if mobj is not None:
741             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
742             video_extension = mediaURL[-3:]
743
744             # Extract gdaKey if available
745             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
746             if mobj is None:
747                 video_url = mediaURL
748             else:
749                 gdaKey = mobj.group(1)
750                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
751         else:
752             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
753             if mobj is None:
754                 self._downloader.report_error(u'unable to extract media URL')
755                 return
756             vardict = compat_parse_qs(mobj.group(1))
757             if 'mediaData' not in vardict:
758                 self._downloader.report_error(u'unable to extract media URL')
759                 return
760             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
761             if mobj is None:
762                 self._downloader.report_error(u'unable to extract media URL')
763                 return
764             mediaURL = mobj.group(1).replace('\\/', '/')
765             video_extension = mediaURL[-3:]
766             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
767
768         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
769         if mobj is None:
770             self._downloader.report_error(u'unable to extract title')
771             return
772         video_title = mobj.group(1).decode('utf-8')
773
774         mobj = re.search(r'submitter=(.*?);', webpage)
775         if mobj is None:
776             self._downloader.report_error(u'unable to extract uploader nickname')
777             return
778         video_uploader = mobj.group(1)
779
780         return [{
781             'id':       video_id.decode('utf-8'),
782             'url':      video_url.decode('utf-8'),
783             'uploader': video_uploader.decode('utf-8'),
784             'upload_date':  None,
785             'title':    video_title,
786             'ext':      video_extension.decode('utf-8'),
787         }]
788
789
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Pull the video id out of the URL (drop title suffix and query string).
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the watch page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The player configuration lives in a JS "flashvars" assignment.
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best available quality key, highest first.
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: regular owner markup first, then the official-user
        # markup; a missing uploader is only worth a warning.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
877
878
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # The video id is the .flv filename in the "current" query parameter.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Download the page the URL points at.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The media URL sits in the video_src <link> tag's "file" parameter.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader both come from the <title> element.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
942
943
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs are first rewritten into a canonical English
    /watch/ URL and re-dispatched through _real_extract.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group(1) is the '(people|profile)' path component of the
        # profile link; the uploader's display name is in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (both are required by the
        # playlist request below).
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1085
1086
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the embedded player config JSON for the title, owner,
    thumbnail and file list, then builds a play_redirect URL from the
    request signature/timestamp and the best available codec/quality.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize to an https:// watch-page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player setup.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: was a bare "except:", which also swallows SystemExit and
        # KeyboardInterrupt; narrow to ordinary runtime errors only.
        except Exception:
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer hd over sd over whatever else was listed.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1205
1206
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # Matches French/German video pages on videos.arte.tv.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in "index-<number>.html".
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body.

        Returns None (after reporting an error) when the download fails
        or the URL is invalid.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and collect groups into a dict.

        matchTuples is a list of (group_index, key, error_message)
        tuples; each matched group is stored under *key*.  Returns None
        when the pattern does not match or a listed group is empty.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None here and
        # re.search raises TypeError instead of failing cleanly — confirm.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): uses the legacy trouble() API while the rest
                # of this file uses report_error — consider unifying.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS to locate the rtmp URL and swf player."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or stored,
        # so live streams currently yield no result — confirm intent.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page down to its video XML and build the info dict."""
        video_lang = url.split('/')[-3]
        # Step 1: the player param points at a videoref file URL.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        # Step 2: the videoref file links to a per-language <video> ref.
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the final XML carries id, title, date and the HD URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages take the (result-less) live path; everything else is
        # treated as an Arte+7 on-demand stream.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1341
1342
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Matches any URL. First follows HTTP redirects (URL shorteners), then
    falls back to heuristic regexes for common embedded players (JW Player
    in SWFObject, plain file=/source= parameters, the JWPlayer JS loader).
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download, warning that the generic IE is in use."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Returns False when the URL does not redirect anywhere.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this branch previously reported 'unable to extract
            # title' (copy-paste from the title branch above)
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1486
1487
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo-URLs of the form ytsearch[N|all]:<query>: queries the
    GData API and queues the resulting watch URLs for download.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # BUGFIX: split only on the first ':' -- a search query that itself
        # contains a colon used to raise ValueError (too many values to unpack)
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                # CONSISTENCY: was self._downloader.trouble(...) -- every other
                # error path in this class uses report_error
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems may be smaller than the requested n; never loop past it
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1566
1567
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles pseudo-URLs of the form gvsearch[N|all]:<query>: scrapes the
    result pages and queues the matching videoplay URLs for download.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # BUGFIX: split only on the first ':' -- a search query that itself
        # contains a colon used to raise ValueError (too many values to unpack)
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            # No "next page" link means we have exhausted the results
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1648
1649
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles pseudo-URLs of the form yvsearch[N|all]:<query>.
    Currently marked not working (_WORKING = False).
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # BUGFIX: split only on the first ':' -- a search query that itself
        # contains a colon used to raise ValueError (too many values to unpack)
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            # No "Next" link means we have exhausted the results
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1734
1735
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Figure out which playlist we are dealing with (verbose regex)
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        playlist_id = mobj.group(1) or mobj.group(2)

        # Page through the GData API collecting (position, watch-url) pairs
        videos = []
        page_index = 1
        while True:
            self.report_download_page(playlist_id, page_index)

            start_index = self._MAX_RESULTS * (page_index - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            try:
                page = compat_urllib_request.urlopen(api_url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response or 'entry' not in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            entries = response['feed']['entry']
            for entry in entries:
                # Entries without 'content' carry no playable URL; skip them
                if 'content' in entry:
                    videos.append((entry['yt$position']['$t'], entry['content']['src']))

            # A short page means there are no further pages
            if len(entries) < self._MAX_RESULTS:
                break
            page_index += 1

        # Sort by playlist position, then apply --playlist-start/--playlist-end
        videos = [link for (_position, link) in sorted(videos)]
        total = len(videos)

        start_at = self._downloader.params.get('playliststart', 1) - 1
        stop_at = self._downloader.params.get('playlistend', -1)
        videos = videos[start_at:] if stop_at == -1 else videos[start_at:stop_at]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        url_results = [self.url_result(video_url) for video_url in videos]
        return [self.playlist_result(url_results, playlist_id)]
1825
1826
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Pull the channel id out of the URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)

        # Walk the paginated channel listing until the "Next" marker vanishes
        video_ids = []
        page_index = 1
        while True:
            self.report_download_page(channel_id, page_index)
            page_url = self._TEMPLATE_URL % (channel_id, page_index)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the watch ids from this page, de-duplicated in order
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_index = page_index + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % vid)
                       for vid in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1877
1878
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Pull the username out of the URL / ytuser: pseudo-URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # keep fetching pages until one comes back short of a full page.
        video_ids = []
        page_index = 0
        while True:
            first = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, first)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate the ids found on this page, preserving order
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # A short page means this was the last one - no need to query again
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            page_index += 1

        all_ids_count = len(video_ids)

        # Apply --playlist-start / --playlist-end
        start_at = self._downloader.params.get('playliststart', 1) - 1
        stop_at = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[start_at:] if stop_at == -1 else video_ids[start_at:stop_at]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % vid)
                       for vid in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1961
1962
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user page to a numeric users_id, then pages through the
    mobile episode-list endpoint collecting episode URLs.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # BUGFIX: the id regex was previously applied inside the try above,
        # and a failed match raised an uncaught AttributeError on .group(1);
        # the surrounding except only handles network errors.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract blip.tv user id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # CONSISTENCY: was str(err); compat_str is used everywhere else
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                # BUGFIX: compare the unescaped form; the raw match used to be
                # compared against unescaped entries, defeating de-duplication
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2053
2054
2055 class DepositFilesIE(InfoExtractor):
2056     """Information extractor for depositfiles.com"""
2057
2058     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2059
    def report_download_webpage(self, file_id):
        """Report that the webpage for the given file id is being downloaded."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2063
    def report_extraction(self, file_id):
        """Report that information extraction for the given file id has started."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2067
2068     def _real_extract(self, url):
2069         file_id = url.split('/')[-1]
2070         # Rebuild url in english locale
2071         url = 'http://depositfiles.com/en/files/' + file_id
2072
2073         # Retrieve file webpage with 'Free download' button pressed
2074         free_download_indication = { 'gateway_result' : '1' }
2075         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2076         try:
2077             self.report_download_webpage(file_id)
2078             webpage = compat_urllib_request.urlopen(request).read()
2079         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2080             self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2081             return
2082
2083         # Search for the real file URL
2084         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2085         if (mobj is None) or (mobj.group(1) is None):
2086             # Try to figure out reason of the error.
2087             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2088             if (mobj is not None) and (mobj.group(1) is not None):
2089                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2090                 self._downloader.report_error(u'%s' % restriction_message)
2091             else:
2092                 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2093             return
2094
2095         file_url = mobj.group(1)
2096         file_extension = os.path.splitext(file_url)[1][1:]
2097
2098         # Search for file title
2099         mobj = re.search(r'<b title="(.*?)">', webpage)
2100         if mobj is None:
2101             self._downloader.report_error(u'unable to extract title')
2102             return
2103         file_title = mobj.group(1).decode('utf-8')
2104
2105         return [{
2106             'id':       file_id.decode('utf-8'),
2107             'url':      file_url.decode('utf-8'),
2108             'uploader': None,
2109             'upload_date':  None,
2110             'title':    file_title,
2111             'ext':      file_extension.decode('utf-8'),
2112         }]
2113
2114
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials are available (options or .netrc)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials: proceed anonymously.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # POST data must be bytes on Python 3; encoding is a no-op on Python 2.
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            # urlopen().read() returns bytes; decode before matching with a
            # str pattern (mixing bytes and str raises TypeError on Python 3).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video URL, title, duration and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as a JSON array between these
        # two literal snippets of the swf setup code.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream and fall back to SD. Using .get() avoids a
        # raw KeyError when either key is absent; a missing URL still ends
        # in the explicit ExtractorError below.
        video_url = params.get('hd_src')
        if not video_url:
            video_url = params.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2210
2211
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the server answered with the media file itself."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info, resolving /play/ redirect URLs first."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page whose URL fragment carries
        # the real file id; resolve it and restart extraction on /a/a-<id>.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves JSON metadata to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # title and ext are already text strings; the previous
                # title.decode('UTF-8') crashed on Python 3 (str has no decode()).
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the metadata in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2312
2313
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this called self._download (nonexistent attribute),
            # raising AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media server base URL; the flv
        # file lives under the same directory as the thumbnails.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2362
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate identifiers, used for --list-formats / format selection.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate identifier.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate identifier (informational only).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report the download of a media item's configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report the download of the show's episode index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available formats with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode or a single clip.

        Returns a list of info dicts, one per <item> in the show's RSS index.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # ":shortname" abbreviations are mapped to the show's
        # full-episodes page and the URL is re-parsed.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode/clip title; dlNewest means the bare
        # full-episodes page was given and the server redirect decides
        # which episode is the newest.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to the newest episode and re-parse the
            # final URL to recover the concrete episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The mtvnservices URI identifies the episode's media feed.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the RSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL onto an HTTP progressive-download base;
            # only the gsp.comedystor/... path component is kept.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2557
2558
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from the page's player configuration."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header,
            # falling back to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Each metadata search is now guarded: previously a missing tag
        # caused an AttributeError on None instead of a clean error report.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.report_error(u'unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.report_error(u'unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The player URL carries the (percent-encoded) config URL.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2632
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked as not working; presumably skipped by default — TODO confirm
    # against the _WORKING handling in the base class.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the f4f fragment URL via the moogaloop metadata and the
        Adobe HDS (f4m) manifest it points to."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # Step 1: fetch the metadata XML for title/description/thumbnail
        # and the manifest URL.
        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Step 2: fetch the f4m manifest (hdcore parameter is required by
        # the server).
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest uses the Adobe f4m namespace; pull the media node's
        # url attribute and the manifest id.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the first-fragment URL from the manifest location and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2703
2704
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract media URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is percent-encoded inside a flashvars parameter.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # The page <title> carries the video title before the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        # The whole matched thumbnail URL (group 0) is used.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2762
2763
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The track page URL is resolved into the track's metadata through the
    public Soundcloud API; the actual media URL is then read from the
    per-track stream-definition endpoint.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'
    # API key shared by the resolve and the streams endpoints.
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the page URL is being resolved to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The uploader and the slug of the song title are both in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL into the track's API metadata (JSON).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + self._CLIENT_ID
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the stream definitions and pick the 128 kbit/s MP3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=' + self._CLIENT_ID
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2836
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    The set page URL is resolved into the set's metadata through the
    public Soundcloud API; a media URL is then read from the per-track
    stream-definition endpoint for every track of the set.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'
    # API key shared by the resolve and the streams endpoints.
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the page URL is being resolved to a set id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error instead of the deprecated trouble() helper,
            # matching the rest of the extractors in this file.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The uploader and the slug of the set title are both in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the page URL into the set's API metadata (JSON).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + self._CLIENT_ID
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Fetch the stream definitions and pick the 128 kbit/s MP3 stream.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=' + self._CLIENT_ID
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2917
2918
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real id is base64-encoded in the page.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split on the LAST dot so filenames containing extra dots do not
        # raise ValueError (str.split('.') with tuple unpacking did).
        video_filename = video_url.split('/')[-1]
        video_id, _, extension = video_filename.rpartition('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2972
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested format and bitrate; when
        the format carries no bitrate mapping (a plain list), that list
        is returned unchanged.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns the first reachable url from the list, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url; the match groups are
        # already text strings, so no .decode() is needed (str has no
        # .decode in Python 3 and calling it raised AttributeError).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (the HTTP response is bytes; decode it explicitly)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Initialize so an empty formats dict cannot leave these unbound.
        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # every candidate URL failed the reachability check
            self._downloader.report_error(u'unable to find a working stream URL')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3087
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page (course + video),
    a course page (expanded into its videos), and the site root
    (expanded into all courses). The playlist branches recurse through
    self.extract() on each referenced page.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Each video has a small XML descriptor next to the media files.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and relative media path come from the per-video XML.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Fall back to the course id when the page has no <h1> title.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the course's video pages, deduplicated in order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each video page goes through the specific-video branch.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect every course page, deduplicated in order.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each course page goes through the course branch.
                results += self.extract(entry['url'])
            return results
3199
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The metadata lives in <meta> tags. _download_webpage already
        # decoded the page, so the match groups are text strings: the
        # former .decode('iso-8859-1') calls raised AttributeError on
        # Python 3 and have been dropped.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # Ask the mediaGen service for the list of available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error instead of the deprecated trouble() helper.
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3279
3280
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in numbered segments. The scrambled file id from
    the playlist JSON must first be descrambled with the per-video seed
    before the per-segment download URLs can be built.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the substitution alphabet for *seed*.

        A linear congruential generator driven by the seed repeatedly
        picks characters from the source alphabet without replacement.
        Presumably a direct port of the scrambling code in Youku's own
        player — do not "simplify" the arithmetic.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Descramble *fileId* ('*'-separated indexes) via the seed's alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested format to Youku's stream names:
            # best -> hd2 when available, worst -> mp4, anything else -> flv.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3390
3391
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce that the page download has started."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that information extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the page and pull out stream URL, title and thumbnail."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page content directly.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # Stream location is an escaped flv_url query parameter.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail address is another query parameter in the page.
        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3454
3455
3456 class GooglePlusIE(InfoExtractor):
3457     """Information extractor for plus.google.com."""
3458
3459     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3460     IE_NAME = u'plus.google'
3461
    def __init__(self, downloader=None):
        # No extractor-specific state; just initialize the base class.
        InfoExtractor.__init__(self, downloader)
3464
3465     def report_extract_entry(self, url):
3466         """Report downloading extry"""
3467         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3468
3469     def report_date(self, upload_date):
3470         """Report downloading extry"""
3471         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3472
3473     def report_uploader(self, uploader):
3474         """Report downloading extry"""
3475         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3476
3477     def report_title(self, video_title):
3478         """Report downloading extry"""
3479         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3480
3481     def report_extract_vid_page(self, video_page):
3482         """Report information extraction."""
3483         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3484
3485     def _real_extract(self, url):
3486         # Extract id from URL
3487         mobj = re.match(self._VALID_URL, url)
3488         if mobj is None:
3489             self._downloader.report_error(u'Invalid URL: %s' % url)
3490             return
3491
3492         post_url = mobj.group(0)
3493         video_id = mobj.group(1)
3494
3495         video_extension = 'flv'
3496
3497         # Step 1, Retrieve post webpage to extract further information
3498         self.report_extract_entry(post_url)
3499         request = compat_urllib_request.Request(post_url)
3500         try:
3501             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3502         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3503             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3504             return
3505
3506         # Extract update date
3507         upload_date = None
3508         pattern = 'title="Timestamp">(.*?)</a>'
3509         mobj = re.search(pattern, webpage)
3510         if mobj:
3511             upload_date = mobj.group(1)
3512             # Convert timestring to a format suitable for filename
3513             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3514             upload_date = upload_date.strftime('%Y%m%d')
3515         self.report_date(upload_date)
3516
3517         # Extract uploader
3518         uploader = None
3519         pattern = r'rel\="author".*?>(.*?)</a>'
3520         mobj = re.search(pattern, webpage)
3521         if mobj:
3522             uploader = mobj.group(1)
3523         self.report_uploader(uploader)
3524
3525         # Extract title
3526         # Get the first line for title
3527         video_title = u'NA'
3528         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3529         mobj = re.search(pattern, webpage)
3530         if mobj:
3531             video_title = mobj.group(1)
3532         self.report_title(video_title)
3533
3534         # Step 2, Stimulate clicking the image box to launch video
3535         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3536         mobj = re.search(pattern, webpage)
3537         if mobj is None:
3538             self._downloader.report_error(u'unable to extract video page URL')
3539
3540         video_page = mobj.group(1)
3541         request = compat_urllib_request.Request(video_page)
3542         try:
3543             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3544         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3545             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3546             return
3547         self.report_extract_vid_page(video_page)
3548
3549
3550         # Extract video links on video page
3551         """Extract video links of all sizes"""
3552         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3553         mobj = re.findall(pattern, webpage)
3554         if len(mobj) == 0:
3555             self._downloader.report_error(u'unable to extract video links')
3556
3557         # Sort in resolution
3558         links = sorted(mobj)
3559
3560         # Choose the lowest of the sort, i.e. highest resolution
3561         video_url = links[-1]
3562         # Only get the url. The resolution part in the tuple has no use anymore
3563         video_url = video_url[-1]
3564         # Treat escaped \u0026 style hex
3565         try:
3566             video_url = video_url.decode("unicode_escape")
3567         except AttributeError: # Python 3
3568             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3569
3570
3571         return [{
3572             'id':       video_id,
3573             'url':      video_url,
3574             'uploader': uploader,
3575             'upload_date':  upload_date,
3576             'title':    video_title,
3577             'ext':      video_extension,
3578         }]
3579
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first (HTML-unescaped) group of rexp in the page,
            # or default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: the key was misspelled 'uploader_date'; the documented
            # field name (see the module docstring) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3615
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Fetch one API page and return (item_count, info_dicts).

        On a download or API error the error is reported and an empty
        page (0, []) is returned, so the caller's tuple unpacking and
        pagination loop keep working.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # BUGFIX: returning None here made the caller's
            # "page_count, page_info = ..." unpacking raise TypeError.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # BUGFIX: same as above — report an empty page, don't return None.
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Skip clips without a direct file URL; only valid items are kept.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-ish 'YYYY-MM-DD...'; strip the dashes.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL (no /b/<id> part): archives are paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3702
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # BUGFIX: bail out; the original fell through and crashed on
            # m.group() with m being None.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # BUGFIX: use report_error like the rest of this class
            # (trouble() is the deprecated API) and bail out.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3739
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so the default suitable() (which
        # matches without re.VERBOSE) would not work here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # All of a game's trailers live on its /video/<gameID>/ page.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # URLs, titles and thumbnails appear in the same order on the page.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                # BUGFIX: skip this entry; the original still appended an
                # unusable info dict with an empty URL.
                continue
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            })
        return videos
3780
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The numeric recording id is the only dynamic part of the URL.
        recording_id = re.match(self._VALID_URL, url).group('videoID')
        # The downloadable file lives on a fixed CDN path.
        download_url = u'http://tcdn.ustream.tv/video/%s' % recording_id
        page = self._download_webpage(url, recording_id)
        title = re.search(r'data-title="(?P<title>.+)"', page).group('title')
        channel = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', page).group('uploader')
        return [{
            'id': recording_id,
            'url': download_url,
            'ext': 'flv',
            'title': title,
            'uploader': channel,
        }]
3802
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Direct media links are served from hw-videos hosts.
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Pick the container extension from the URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # BUGFIX: use report_error like the other extractors instead of
            # the deprecated trouble() (report_error adds the ERROR prefix).
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # BUGFIX: fixed the 'World Start' typo in the fallback title.
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3858
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON blob in an inline script.
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            show = json.loads(metadata_match.group(1))
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        # Request the 256 kbit/s stream from the Akamai URL.
        stream_url = show['akamai_url'] + '&cbr=256'
        extension = compat_urllib_parse_urlparse(stream_url).path.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': stream_url,
            'ext': extension,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': show.get('host', {}).get('name'),
            'uploader_id': show.get('host', {}).get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
3893
3894
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict whose 'format' id equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site requires an age_verified cookie to serve the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component 4 is '<size>_<bitrate>_<id>'; the first two
            # fields form the format identifier (e.g. '480p-370k').
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: the original tested the stale 'result' variable (left
            # over from the page regexes above), so a missing format was
            # never reported and [None] could be returned.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
4011
4012
4013
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The title is part of the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: the message wrongly said 'video title' (copy-paste
            # from another extractor); this branch is about the upload date.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4055
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = url_match.group('videoid')

        # Fetch the watch page; it only embeds the real player page.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        # Switch to the numeric id used by the embed page.
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via addVariable().
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4101
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix description is embedded as a JSON assignment in the page.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id is sufficient for the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        entries = []
        track_index = 0
        # Walk the play/next API until it flags the last track.
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            track_index += 1
        return entries
4145
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs follow a fixed CDN scheme.
        media_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumb_url = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_url,
            'uploader': uploader
        }]
4169
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the default matcher
        # (no re.VERBOSE) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if url_match.group('type_talk'):
            # A single talk page.
            return [self._talk_info(url)]
        # Otherwise it is a playlist of talks.
        playlist_id = url_match.group('playlist_id')
        name = url_match.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME, playlist_id, name))
        return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        """Build the direct download URL for a media slug."""
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        """Return the info dicts for every talk in the playlist."""
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        results = []
        # Talk entries and their title links appear in the same order.
        for entry_match, name_match in zip(re.finditer(video_RE, webpage, re.VERBOSE),
                                           re.finditer(video_name_RE, webpage)):
            talk_url = 'http://www.ted.com%s' % name_match.group('talk_url')
            results.append(self._talk_info(talk_url, entry_match.group('video_id')))
        return results

    def _talk_info(self, url, video_id=0):
        """Return the info dict for the talk at the given URL."""
        videoName = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        mediaSlug = info_match.group('mediaSlug')
        return {
            'id': info_match.group('videoID'),
            'url': self._talk_video_link(mediaSlug),
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
4242
class MySpassIE(InfoExtractor):
    """Extractor for videos hosted on myspass.de.

    The site exposes an XML metadata endpoint keyed by the numeric video
    id, which is taken from the last (or second-to-last, when there is a
    trailing slash) path element of the URL.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch used to read `format = ext`, but no local
            # named `ext` exists, raising NameError whenever the metadata has
            # no <format_id>. Fall back to the file extension instead.
            video_format = extension
        else:
            video_format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4298
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de video pages."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A companion XML document describes the available files; the last
        # element is the one whose filename/duration are extracted here.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4331
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Consistency: use report_error like the rest of this extractor
            # (report_error prefixes 'ERROR: ' itself).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: previously this only reported the problem and then
            # fell through to m.group('title') on None, crashing with
            # AttributeError. Report and abort instead.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional page metadata.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4380
4381
def gen_extractors():
    """Build the ordered list of extractor instances.

    Order matters: the first extractor whose suitable() accepts a URL is
    the one that handles it, so GenericIE must stay last.
    """
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        GenericIE,
    ]
    return [klass() for klass in ie_classes]