Get the title of YouTube playlists
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
from __future__ import absolute_import

import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE"
        # (e.g. YoutubeIE -> "Youtube"). Subclasses may override with a
        # plain class attribute instead.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Stop the charset capture at a quote, whitespace or ';' so that a
        # quoted value or trailing parameters (e.g. charset="utf-8"; foo=bar)
        # do not leak into the codec name and break .decode() below.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset="?([^"\s;]+)"?', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')

    # Methods for following #608
    # They set the correct value of the '_type' key

    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
169
170
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (formats not listed here default to flv)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "heightxwidth" dimensions string (for --list-formats output)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; defer them to
        # YoutubePlaylistIE explicitly.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download video subtitles for a language."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} of available subtitles, or an
        (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for the video."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error_message, sub_lang, sub) tuples, one per
        available subtitle language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print the given itags with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # The login form requires the GALX and dsh hidden fields to be echoed back
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID for *url*, or None (after reporting) if the
        URL does not match _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # group(1) is the URL prefix (may be absent for naked IDs);
        # group(2) is the video ID itself.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    break
                except ValueError:
                    # This expression didn't match; try the next one.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Build itag -> URL map; append the signature only when present
            # so an entry without 'sig' degrades gracefully instead of
            # raising KeyError.
            url_map = {}
            for ud in url_data:
                fmt_url = ud['url'][0]
                if 'sig' in ud:
                    fmt_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = fmt_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
668
669
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    _real_initialize dismisses the family-filter age gate so restricted
    videos are visible; _real_extract delegates "yt-" prefixed ids to the
    YouTube extractor and otherwise scrapes the media URL (plus the
    optional gdaKey access token) from the watch page.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer page (sets up the session on the server side)
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age: POST the family-filter form so age-restricted
        # videos are served for the rest of this session.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract id, media URL, title and uploader from a watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube: metacafe mirrors such videos
        # under a "yt-<youtube id>" identifier; hand those off.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information.
        # _download_webpage returns decoded text, so no .decode() calls are
        # needed on anything extracted below (the previous code called
        # .decode('utf-8') on these values, which breaks on Python 3 and
        # raises UnicodeDecodeError for non-ASCII text on Python 2).
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob embedded in the page.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # The JSON blob escapes slashes; undo that.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
788
789
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Announce that information extraction has started."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Scrape a Dailymotion page and return a one-element info list."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the first path component, with any title slug or
        # query-string suffix stripped off.
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best quality whose key appears in the flashvars blob.
        quality_keys = ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url')
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # Undo the escaped slashes in the JSON-ish value.
        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Try the regular owner markup first, then the "official user"
        # variant; a missing uploader is only a warning.
        video_uploader = None
        owner = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner is not None:
            video_uploader = owner.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
877
878
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    The direct .flv URL, title and uploader are scraped from the page
    referenced by the "current=" query parameter.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for the given page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # Decode the response once here: urlopen().read() returns bytes on
        # Python 3, and the per-value str.decode('utf-8') calls this
        # replaces fail there (str has no decode()); the overall strict
        # UTF-8 decoding semantics are unchanged.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
942
943
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs are first rewritten to the canonical English
    /watch/ form and re-extracted; the media URL then comes from a
    separate playlist XML request.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info; recurses once (new_video=False) after
        rewriting a non-/watch/ URL to its canonical form."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs.
        # Responses are decoded once after download: urlopen().read()
        # returns bytes on Python 3, and the str.decode() calls this class
        # previously scattered over extracted values fail there.
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group(1) is the literal 'people'/'profile' path segment;
        # the uploader name is the anchor text captured by group(2).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed by the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1085
1086
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the player config JSON embedded in the page and picks the
    best available codec/quality combination for the download URL.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo page.

        Normalizes the URL (forces https, resolves play_redirect_hls
        direct links), then reads everything from the config JSON.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player markup.
        # BUGFIX: the previous bare "except:" also swallowed
        # KeyboardInterrupt and SystemExit; only the errors this code can
        # actually produce are caught now (IndexError if the marker is
        # missing, ValueError from json.loads on malformed JSON).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty quality bucket, best first.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1205
1206
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles videos.arte.tv French/German "+7" catch-up pages via
    extractPlus7Stream.  URLs whose last component matches _LIVE_URL are
    routed to extractLiveStream, which currently produces no usable
    result (see the NOTE there).
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body, or None after reporting an
        error.

        NOTE(review): the body is returned exactly as served (bytes on
        Python 3); callers apply regexes to it directly.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # Raised by urlopen for malformed URLs.
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* (with *regexFlags*) and build a dict
        from *matchTuples*, a list of (group_index, key, error_message)
        triples.  Returns None (after reporting) when the regex does not
        match or a required group is empty.

        NOTE(review): if fetch_webpage failed, page is None and re.search
        will raise TypeError -- there is no guard here.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): uses the legacy trouble() API (the err
                # messages already carry an 'ERROR: ' prefix), while the
                # rest of this file calls report_error() -- confirm the
                # downloader semantics before unifying.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection and locate the rtmp URL.

        NOTE(review): video_url is computed but never returned -- this
        method always returns None, so live streams are effectively
        unsupported (see _real_extract).
        """
        # Language code is the 4th-from-last path component of the URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve a "+7" catch-up page to its hd stream info dict.

        Chains three fetches: page -> videoref XML -> per-language video
        XML, then picks the <url quality="hd"> entry.
        """
        # Language code is the 3rd-from-last path component of the URL.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode('utf-8') assumes a byte string here; on
            # Python 3 (str groups) this raises AttributeError -- confirm.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live or "+7" extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # NOTE(review): the live branch returns None (extractLiveStream
        # discards its result), so live URLs yield no downloads.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1341
1342
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First resolves URL-shortener style redirects (handing the final URL
    back to the extractor framework), then scrapes the page for common
    embedded-player patterns (JW Player / SWFObject ``file=``/``source=``
    flashvars) to locate a direct video URL.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that we are guessing, except in test mode where the warning
        # would only add noise.
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed to new_url."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Returns False when the URL does not redirect anywhere else.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # A HEAD request has no body, so body-related headers
                    # must not be carried over to the redirected request.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fixed: this failure is about the uploader (domain name),
            # not the title; the old message said "title".
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1486
1487
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (``ytsearch[N|all]:term``)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ``ytsearch[N|all]:query`` string and queue the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "ytsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # Fixed: use report_error like every other extractor instead of
            # the legacy trouble() method, and the idiomatic "not in" test.
            if 'items' not in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the total number of hits; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1566
1567
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (``gvsearch[N|all]:term``)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ``gvsearch[N|all]:query`` string and queue the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "gvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Fixed: decode the response so the str regexes below work on
                # Python 3 as well (consistent with YoutubeSearchIE).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1648
1649
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (``yvsearch[N|all]:term``)."""

    _WORKING = False  # site has changed; extractor currently disabled
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ``yvsearch[N|all]:query`` string and queue the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "yvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Fixed: decode the response so the str regexes below work on
                # Python 3 as well (consistent with YoutubeSearchIE).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1734
1735
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through the GData playlist feed, collects every video URL in
    playlist order and returns them as a single playlist result carrying
    the playlist's title.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # Fixed: initialize the title so an empty playlist (first page
        # without 'entry') cannot raise NameError at the return below.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return

            # Fixed: read the title before the 'entry' check so it is also
            # available when the final (or only) page has no entries.
            playlist_title = response['feed']['title']['$t']

            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Entries arrive page by page; order them by playlist position.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1818
1819
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Downloads the first channel page as HTML, then keeps requesting the
    JSON channel_ajax endpoint while the "load more" widget is present,
    and returns every found video as a playlist result.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the watch-ids linked from *page*, first occurrence first."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        # The channel id is the only capture group of _VALID_URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)

        # The first page is plain HTML.
        pagenum = 1
        self.report_download_page(channel_id, pagenum)
        first_url = self._TEMPLATE_URL % (channel_id, pagenum)
        try:
            page = compat_urllib_request.urlopen(compat_urllib_request.Request(first_url)).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the JSON channel_ajax endpoint,
        # which is only queried while the "load more" widget shows up.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1
                self.report_download_page(channel_id, pagenum)
                ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                try:
                    raw = compat_urllib_request.urlopen(compat_urllib_request.Request(ajax_url)).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                payload = json.loads(raw)
                video_ids.extend(self.extract_videos_from_page(payload['content_html']))

                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(entries, channel_id)]
1892
1893
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the GData uploads feed of a user and returns all of the
    user's videos as a playlist result titled with the username.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # The username is the only capture group of _VALID_URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps results per request (currently 50), so keep
        # paging until a page comes back short - that one must be the last.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's ids, preserving first-seen order.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in ids_in_page:
                    ids_in_page.append(match.group(1))
            video_ids.extend(ids_in_page)

            # A short page means there is nothing left to fetch.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1964
1965
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the username to a numeric user id scraped from the user's
    page, then pages through the mobile episode-list AJAX endpoint and
    returns every episode as a playlist result.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Fixed: the numeric user id may be missing from the page; the old
        # code raised an unhandled AttributeError in that case because the
        # surrounding except only caught network errors.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract the user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fixed: use compat_str like the other extractors instead
                # of str(err), which can fail on py2 unicode messages.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                # Fixed: compare the unescaped id, so the duplicate check
                # matches what is actually stored in the list.
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2044
2045
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Presses the 'Free download' button, then scrapes the resulting page
    for the real file URL (or the restriction message explaining why the
    download is not available).
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Fixed: decode the page once (assumed UTF-8) so the str regexes
            # below work on Python 3 as well; urlopen returns bytes there.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        # Fixed: the page was decoded above, so every extracted field is
        # already text; the old .decode('utf-8') calls on these str values
        # crashed on Python 3.
        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2104
2105
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Credentials come from the --username/--password options or, when
        --usenetrc is set, from the 'facebook' machine in ~/.netrc.
        Login failures only emit warnings; extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response: matching a str pattern against raw bytes
            # raises TypeError on Python 3, which would escape the except
            # clause below (it only catches network errors).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # A login <form> in the response means we are still logged out.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video URL, title and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The SWF parameters are embedded as JSON between these two
        # JavaScript snippets in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD when HD is unavailable.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2203
2204
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        /play/ URLs are first resolved (via their redirect target's URL
        fragment) to the canonical /a/a-<id> form, then the metadata is
        fetched from blip.tv's JSON API unless the server answers with the
        media itself (direct download).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # The redirect target carries the real file id in its URL
            # fragment; rebuild the canonical URL and recurse once.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # Python 2 URL components may be bytestrings and need
                # decoding; on Python 3 they are already str, and calling
                # .decode() on str would raise AttributeError.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # 'datestamp' looks like e.g. '12-31-12 11:28PM'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2305
2306
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the direct FLV URL and title for a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this previously called self._download.report_error
            # (no such attribute), raising AttributeError instead of
            # reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        # The thumbnail link's base path plus the video id gives the FLV URL.
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2355
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate identifiers, in descending bitrate order.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate identifier (used for --list-formats).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame size per bitrate identifier (used for --list-formats).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in re.VERBOSE style, so it must be matched
        # with that flag here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report the download of one media item's configuration feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report the download of the episode's show index (MRSS feed)."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available formats (bitrate, extension, dimensions)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part of the requested episode or clip.

        Steps: resolve shortname/newest-episode URLs, fetch the page, find
        the mtvnservices media URI, download the MRSS index listing the
        episode's parts, then for each part fetch its config feed, pick a
        bitrate, and rewrite the RTMP URL to a direct HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # ':tds' / ':colbert' style abbreviations map to the show's
        # full-episodes page, which redirects to the newest episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Clip URLs: the title group differs between the two shows.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Episode URLs: an empty episode part means "newest episode".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # We were redirected to a concrete episode; re-parse the final
            # URL to learn which one.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The media URI ('mgid') is embedded either as an SWF param or a
        # JavaScript 'url' variable.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp_url) pairs from the config feed.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL to the equivalent direct HTTP URL on
            # the CDN (rtmpdump is not needed that way).
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2550
2551
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from an escapistmagazine.com video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset declared in the Content-Type header;
            # fall back to UTF-8 when it is missing.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Description and thumbnail are optional fields: missing matches
        # degrade to None instead of crashing with AttributeError.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL is required: the config URL is embedded in it.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # Entry 0 is the preroll/branding clip; entry 1 is the video proper.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2625
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Announce the download of the XML manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the f4f fragment URL via the metadata and f4m manifest XML."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # Pull title, description, thumbnail and the manifest location out
        # of the metadata document.
        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest supplies the media node id and the real video id.
        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        f4m_ns = '{http://ns.adobe.com/f4m/1.0}'
        try:
            media_node = manifest_doc.findall('./%smedia' % f4m_ns)[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./%sid' % f4m_ns)[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid manifest file')
            return

        parsed = compat_urllib_parse_urlparse(manifest_url)
        video_url = '%s://%s/z%s/%sSeg1-Frag1' % (parsed.scheme, parsed.netloc, video_id[:-2], node_id)

        info['url'] = video_url
        info['ext'] = 'f4f'
        return [info]
2696
2697
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)


        # The media URL is URL-encoded in a flashvars-style parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))


        # The title comes from the page <title>, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)


        # The thumbnail is the whole matched image URL (group 0).
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2755
2756
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the slug of the track title are encoded
        # directly in the URL path.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into a JSON track description via the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Query the stream definitions for the track; we use the 128kbit MP3.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2829
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set (playlist) URL and return one info dict per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Use report_error for consistency with the other extractors in
            # this file; the trouble(u'ERROR: ...') calls used previously are
            # the deprecated error-reporting style.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the page URL into the set's JSON description via the API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        # Resolve each track of the set to its 128kbit MP3 stream URL.
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2910
2911
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMPE stream URL and metadata from an InfoQ page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real media path is base64-encoded in the
        # page's jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; keep a placeholder when absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media filename. rsplit on the
        # last dot so filenames containing extra dots do not raise a
        # ValueError on tuple unpacking (plain split('.') did).
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2965
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # This URL is dead, try the next one.
                pass

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url. re.match on a text pattern
        # already yields text; the old .decode('utf-8') calls on these str
        # values raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (urlopen().read() returns bytes; decode explicitly)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param if format_param is not None else u'NA'),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3080
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video, a course page, or the
        site root. Course and root pages are treated as playlists whose
        linked pages are fed back through self.extract() as 'reference'
        entries, so the result list is built recursively.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # The per-video metadata lives in an XML file next to the video.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title from the page heading; fall back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link on the course page becomes a sub-extraction.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every CoursePage link on the root page is recursed into, which
            # in turn recurses into each course's videos.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3192
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns decoded text, so the former
        # .decode('iso-8859-1') calls on these match groups failed on
        # Python 3 (str has no decode method); unescapeHTML alone suffices.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error for consistency with the rest of the file; the
            # previous trouble() call here was the deprecated style.
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3272
3273
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as numbered segments; each segment's download URL
    is built from a de-obfuscated file id and a per-segment key taken from
    the getPlayList JSON.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a session id: current time in ms plus two random suffixes."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the source alphabet deterministically from 'seed'.

        A linear-congruential step drives a draw-without-replacement over
        the alphabet, so the same server-provided seed always yields the
        same permutation (list of single characters).
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """De-obfuscate a '*'-separated file id using the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # getPlayList returns JSON with the title, the de-obfuscation seed,
        # the per-format stream file ids and the per-segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's requested format onto Youku's format names.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        # NOTE(review): the format below keeps fileid[0:8] and resumes at
        # [10:], i.e. it replaces characters at indices 8-9 — the
        # 'fileid[7:9]' note above looks off by one; confirm before editing.
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3383
3384
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch the page; the metadata sits in the flash player parameters.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3447
3448
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video behind a Google+ post.

        Returns a one-element list of info dicts (id, url, uploader,
        upload_date, title, ext) or None after reporting an error.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # BUGFIX: previously fell through and crashed on mobj.group(1) below
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # BUGFIX: previously fell through and indexed an empty list below
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3572
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Extract an NBA.com video; the media URL is derived from the page path."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First group of rexp in the page, HTML-unescaped, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: the key was misspelled 'uploader_date'; the documented
            # optional field name is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3608
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the archive API as JSON and build info dicts.

        Returns (count_of_items_in_response, list_of_valid_info_dicts).
        On error, reports it and returns (0, []).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # BUGFIX: returning None made the caller crash when unpacking
            # the (count, info) tuple in _real_extract.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # BUGFIX: same as above — keep the (count, info) contract.
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO 'YYYY-MM-DD...'; strip the dashes to get YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL (no /b/ part): page through the archives API.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3695
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # BUGFIX: previously fell through and crashed on m.group below
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> if the player heading is missing.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # BUGFIX: use report_error (trouble is deprecated here) and
                # stop instead of crashing on m.group('title') below.
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3734
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract every trailer listed on a game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Movie entries in the page's javascript: id, file URL, optional name.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators run in page order, so zip pairs matching entries.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                # BUGFIX: skip this entry instead of appending one with no URL
                continue
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
                  }
            videos.append(info)
        return videos
3775
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The CDN URL is derived directly from the recording id.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id

        webpage = self._download_webpage(url, video_id)

        # Title and uploader (channel id) are embedded as data attributes.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader_match = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage)
        uploader = uploader_match.group('uploader')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3797
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and WSHH candy)."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Direct CDN links on the page end in mp4 or flv.
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # BUGFIX: use report_error like the rest of this file;
            # _downloader.trouble with an 'ERROR:' prefix is the old API.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # NOTE(review): 'World Start' looks like a typo for 'World Star',
            # but it is user-visible output so it is left unchanged here.
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3853
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # All show metadata lives in an inline JSON blob assigned to gon.show.
        meta_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not meta_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream; the extension comes from the URL path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3888
3889
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site requires an age_verified cookie to serve the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 is e.g. '480p_370k_8004515' -> size, bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: previously tested the stale 'result' variable here, so a
            # requested-but-unavailable format was never reported.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
4006
4007
4008
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: the message wrongly said 'video title' here.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4050
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page and take the title from the <title> tag.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream URL lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The player receives the file URL through a flashvars assignment.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4096
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment to PAGE.mix.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The play API wants a (random) session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track; otherwise ask for the next one.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4140
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Both the media and thumbnail CDN URLs derive from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_match.group('uploader')),
        }]
4164
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        '''Dispatch on URL type: a single talk or a playlist of talks.'''
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each <li id="talk_..."> carries the talk's data-id and media slug;
        # titles/links come from a parallel list of <p class="talk-title"> tags.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        # The playlist title sits in the headline <span> of the page.
        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Emit one url_result per talk; the TED IE is re-invoked for each.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is inline javascript holding the numeric id and media slug.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # The downloadable mp4 URL is built from the media slug.
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4242
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de; metadata is fetched from an XML endpoint."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: the fallback used to be `format = ext`, but no name
            # `ext` exists in this scope (NameError at runtime); fall back
            # to the file extension derived above instead.
            video_format = extension
        else:
            video_format = format_id_el.text
        # description and thumbnail are optional in the metadata XML
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4298
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # the per-video XML describes the available files
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # take the last entry (presumably the best quality -- NOTE(review):
        # inferred from the original's choice of idoc[-1], not documented)
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4331
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # use report_error for consistency with the rest of this class
            # (trouble() is the older API)
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUGFIX: previously fell through after reporting and crashed
            # with an AttributeError on m.group('title'); stop here instead.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4380
class ARDIE(InfoExtractor):
    """Extractor for ARD Mediathek / mediathek.daserste.de videos."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # a numeric documentId in the query string takes precedence over
        # the last path element
        mobj = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1) if numid else mobj.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [match.groupdict() for match in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max((s for s in streams if int(s['media_type']) == 0),
                     key=lambda s: int(s['quality']))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info['url'] = stream['rtmp_url']
            info['play_path'] = stream['video_url']
        else:
            assert stream['video_url'].endswith('.mp4')
            info['url'] = stream['video_url']
        return [info]
4420
4421
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # One instance of each class, instantiated in matching-priority order.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
4477
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name
    (e.g. 'Youtube' resolves to the module-level class YoutubeIE)."""
    return globals()['%sIE' % ie_name]