Fix some metacafe videos, closes #562
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    Given a URL, an extractor produces information about the video(s) it
    refers to.  The result is a dictionary handed to the FileDownloader,
    which may then download the video.

    Required keys in each result dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); the latter returns a *list* of
    dictionaries as described above.  Broken extractors should set
    _WORKING = False so users are warned and tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)
        self._ready = False

    @classmethod
    def suitable(cls, url):
        """Return True when this extractor can handle *url*."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Return whether this extractor is known to be functional."""
        return cls._WORKING

    def initialize(self):
        """Run one-time initialization (authentication, etc.)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then extract and return the info dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by stripping the trailing "IE".
        class_name = type(self).__name__
        return class_name[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open *url_or_request* and return the response handle.

        Passing note=False suppresses the status line; note=None uses a
        default message.  Network failures are re-raised as ExtractorError.
        """
        if note is None:
            note = u'Downloading video webpage'
        if note is not False:
            status = u'[%s] %s: %s' % (self.IE_NAME, video_id, note)
            self._downloader.to_screen(status)
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Fetch the page and return its body decoded to a unicode string."""
        response = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = response.headers.get('Content-Type', '')
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # Fall back to UTF-8 when the server does not declare a charset.
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw = response.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                request_url = url_or_request.get_full_url()
            except AttributeError:
                # A plain string URL was passed rather than a Request object.
                request_url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + request_url)
            encoded_dump = base64.b64encode(raw).decode('ascii')
            self._downloader.to_screen(encoded_dump)
        return raw.decode(encoding, 'replace')

    # Helpers for #608: they stamp the correct '_type' on the result.
    def video_result(self, video_info):
        """Mark *video_info* as a single video result and return it."""
        video_info.update({'_type': 'video'})
        return video_info

    def url_result(self, url, ie=None):
        """Return a result pointing at a page that still needs processing."""
        # TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Return a playlist result wrapping *entries*."""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
168
169
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles watch pages, youtu.be short links, embeds and naked video IDs
    (see _VALID_URL).  _real_extract returns one info dict per selected
    format.
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; anything not listed falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" label used only for the human-readable format string.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so let the playlist IE claim them first.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download the subtitles for one language/format."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict {lang_code: track_name}, or an (error_message, None)
        tuple on failure (callers distinguish the two with isinstance)."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for *video_id*."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]

        Language preference: --sub-lang option, then 'en', then the first
        available language.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples, one per language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one "itag : extension [dimensions]" line per format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, when credentials are available
        (options or .netrc), log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the anti-forgery tokens the login form expects.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from *url* (group 2 of _VALID_URL), or None
        after reporting an error when the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download the watch page and get_video_info, pick the formats to
        download, and return one info dict per chosen format (or None after
        reporting an error)."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Strip the JavaScript backslash-escaping from the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        # Try several 'el' values; stop at the first response carrying a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then try several date layouts.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except — once a format matches, the
                    # reformatted value fails the remaining strptime calls,
                    # which is silently ignored here by design (it seems).
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig' field;
            # an entry with 'itag'/'url' but no 'sig' would raise KeyError
            # here — confirm against the get_video_info response format.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
667
668
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Handles both the legacy pages (bare &mediaURL=... parameter, optional
    gdaKey) and the newer pages that embed a JSON blob inside the
    "flashvars" parameter. "yt-" prefixed ids are delegated to YouTube.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the age confirmation so
        family-filtered videos become reachable for later requests."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Legacy page layout: the URL is available as a plain
            # query-style parameter.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer page layout: media info is JSON inside the
            # "flashvars" parameter ("mediaURL" plus access "key").
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # JSON escapes slashes; undo that before building the URL.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # BUG FIX: webpage comes from _download_webpage and is already a
        # text string, so the matched groups must NOT be .decode()d again:
        # that raised UnicodeDecodeError for non-ASCII titles on Python 2
        # and AttributeError (str has no decode) on Python 3.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
787
788
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Announce that information extraction has started."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Pull the bare video id out of the URL (strip title suffix and
        # query string).
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so age-gated
        # videos are reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The stream URLs live in a JavaScript "flashvars" assignment.
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Probe qualities from best to worst; keep the first one present.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: try the regular owner markup first, then fall back to
        # the "official user" markup; missing uploader is only a warning.
        video_uploader = None
        match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if match is not None:
            video_uploader = match.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date is rendered as DD-MM-YYYY; store it as YYYYMMDD.
        video_upload_date = None
        match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if match is not None:
            video_upload_date = match.group(3) + match.group(2) + match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
876
877
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    The flv URL is taken from the rel="video_src" link element; title and
    uploader come from the page <title>.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # BUG FIX: decode the raw response once at the I/O boundary;
            # .read() is bytes on Python 3, which made the text regexes
            # below fail. The per-group .decode() calls that followed are
            # consequently dropped (the groups are already text).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
941
942
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs are first rewritten (via one recursive call) to the
    canonical English /watch/ form; the media URL is then obtained from
    the getPlaylistFOP playlist service.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the rewritten canonical URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUG FIX: the uploader name is captured by the second group; the
        # first group only matches the literal "people"/"profile" path
        # component, so group(1) always yielded that word, not the name.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1084
1085
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Extracts the player config JSON embedded in the watch page and builds
    a play_redirect URL from its request signature/timestamp, preferring
    hd > sd > other and h264 > vp8 > vp6.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize bare / direct-link URLs to the canonical watch page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # BUG FIX: this was a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit. Only a failed split
            # (IndexError) or a failed JSON parse (ValueError, including
            # json.JSONDecodeError) means the config section is missing.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket (codecs list order breaks
        # ties inside a bucket).
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1204
1205
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body.

        Returns None after reporting an error if the download fails or
        the URL is invalid.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and match *regex* against it.

        matchTuples is a list of (group_index, key, error_message); each
        named group is stored under its key in the returned dict. Returns
        None (after reporting) when the regex does not match or a group
        is missing.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None here and
        # re.search raises TypeError instead of the reported error below
        # -- confirm whether callers rely on that exception.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): trouble() is the legacy reporting entry
                # point; the rest of this file uses report_error.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp URL of a live stream page.

        NOTE(review): the computed video_url is never returned or stored,
        so live URLs currently yield no download (see _real_extract).
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref chain of an Arte+7 page and return the
        info dict for its hd-quality stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode assumes the title is a Python 2 byte
            # string; on Python 3 str has no decode -- confirm target.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages are handled separately; note that this path returns
        # None (extractLiveStream discards its result, see above).
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1340
1341
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tries, in order: following HTTP redirects (URL shorteners), then a
    few regexes that find a direct video URL embedded in the page
    (JW Player in SWFObject / flashvars / JS loader styles).
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download (with a fallback warning outside test mode)."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Returns False when the URL does not redirect.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fixed copy-paste error: this failure concerns the uploader, not the title.
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1485
1486
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ``ytsearch:``, ``ytsearchN:`` and ``ytsearchall:`` pseudo-URLs
    and queues the matching watch URLs for download.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Each API page returns up to 50 results.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                # Use report_error like every other extractor in this file;
                # the old trouble() call was inconsistent with the rest.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer total results than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1565
1566
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and queue the requested number of results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_google_results)

        try:
            count = int(prefix)
        except ValueError: # parsing prefix as integer fails
            return self._download_n_results(query, 1)

        if count <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
            count = self._max_google_results
        return self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen video identifiers from this page.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in video_ids:
                    continue
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for video_id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                    return

            # No "next page" link: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                return

            pagenum += 1
1647
1648
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and queue the requested number of results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_yahoo_results)

        try:
            count = int(prefix)
        except ValueError: # parsing prefix as integer fails
            return self._download_n_results(query, 1)

        if count <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
            count = self._max_yahoo_results
        return self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen video identifiers from this page.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in already_seen:
                    continue
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for video_id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                    return

            # No "Next" link: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                return

            pagenum = pagenum + 1
1733
1734
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Matches playlist/course/channel-uploads URLs as well as bare playlist
    # ids; the playlist id ends up in group 1 or group 2 (PL/EC/UU prefixes).
    # Written with re.VERBOSE, so suitable()/_real_extract must pass that flag.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the GData playlist feed and return one playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            # GData's start-index parameter is 1-based.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so entries can be sorted into playlist
            # order below; entries without 'content' are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then drop the position component.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url) for url in videos]
        return [self.playlist_result(url_results, playlist_id)]
1815
1816
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in *page*, in order of appearance."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in ids_in_page:
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)

        # The first page is served as plain HTML.
        pagenum = 1
        self.report_download_page(channel_id, pagenum)
        request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids = []
        video_ids.extend(self.extract_videos_from_page(page))

        # Subsequent pages come from the json-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                self.report_download_page(channel_id, pagenum)
                request = compat_urllib_request.Request(self._MORE_PAGES_URL % (pagenum, channel_id))
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                # Stop when the "load more" widget no longer offers another page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id)
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1889
1890
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE video ids, so
        # request consecutive pages until one comes back short, which means
        # we have collected all of them.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract the ids on this page, skipping duplicates.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one - no need to
            # query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id)
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1961
1962
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The user page carries the numeric users_id needed by the Ajax API.
            # NOTE(review): if data-users-id is absent, mobj.group(1) raises
            # AttributeError, which this except clause does not catch - confirm
            # whether that can happen in practice.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Use compat_str for consistency with the other extractors.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        # BUGFIX: all_ids_count was referenced below but never defined,
        # raising NameError at the end of every successful extraction.
        # No start/end slicing is applied any more, so every collected id
        # is downloaded.
        all_ids_count = len(video_ids)
        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2044
2045
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUGFIX: decode the page once here. Under Python 3
            # urlopen().read() returns bytes, so the str regexes below
            # raised TypeError, and the later .decode('utf-8') calls on
            # str objects crashed as well.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2104
2105
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in (credentials from command-line options or .netrc)
    before extracting the direct video URL from the page's swf parameters.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Failures only emit warnings: extraction of public videos can still
        succeed without a session.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response so the str-pattern regex below also works on
            # Python 3, where read() returns bytes.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf parameters are embedded as a JSON array between these two
        # script fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2203
2204
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Uses the site's JSON API (with an iTunes User-Agent) and also handles
    direct-download responses and /play/ redirect URLs.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page that carries the real file id
        # in the URL fragment; resolve it and recurse on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # On Python 3 the URL (and thus the title) is already text;
                # only byte strings (Python 2) need decoding.
                if not isinstance(title, compat_str):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2305
2306
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: previously called self._download.report_error, which
            # raised AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link reveals the media server path; the flv file lives
        # under the same hash directory as the thumbnails.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2355
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known feed bitrates, ordered lowest to highest.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (used for --list-formats output).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate (used for --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction for an episode."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrates with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve shortcuts/clips/episodes and return one info dict per part.

        An episode is split over several media items in the MRSS feed; each
        item becomes its own result entry ("part N" in the title).
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcut forms (":tds", ":colbert", ...) map to the show's
        # full-episodes page; re-match so the named groups are set.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "download the newest episode":
            # the page will redirect to the current one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to the concrete episode URL and
            # re-extract the episode title from it.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mgid-style media URI embedded in the player markup.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # Each <item> in the MRSS feed is one part of the episode.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp_url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL to the equivalent progressive HTTP URL
            # on the CDN, keeping the gsp.comedystor/... path component.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2550
2551
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Scrape the page's meta tags, then read the player config for the video URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = match.group('showname')
        video_id = match.group('episode')

        self.report_extraction(show_name)
        try:
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            # Honour the charset from the Content-Type header, default utf-8.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            page = raw_page.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Pull the metadata out of the page's <meta> tags.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL carries the (url-encoded) config location.
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes must be
        # swapped for double quotes before parsing.
        config_text = config_text.replace("'", '"')
        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL.
        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2625
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken; kept for reference until the site extraction is fixed.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the metadata XML and the f4m manifest, then reassemble the
        fragment URL for the first media rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Info dict is filled incrementally as the two XML documents are parsed.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # First document: moogaloop metadata (title, description, thumbnail,
        # and the URL of the Adobe HTTP Dynamic Streaming manifest).
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore parameter is required for the manifest request to succeed.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # Second document: the f4m manifest (Adobe f4m 1.0 namespace) with
        # the media node's url attribute and the stream id.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the first-fragment URL from the manifest host plus the stream
        # id (sans its last two characters) and the media node id.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2696
2697
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, page title and thumbnail from the video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is url-encoded inside the page's flashvars.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The title comes from the page <title>, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # The thumbnail URL appears verbatim in the page markup.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': thumb_match.group(0),
            'description': None,
        }]
2755
2756
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track permalink via the Soundcloud API and return the
        mp3 stream info dict (single-element list)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # uploader and the slug of the song title are both in the url
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve the permalink into the track's API resource (JSON)
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # fetch the per-track stream definitions; http_mp3_128_url is the
        # direct mp3 download url
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            # NOTE(review): created_at is Soundcloud's raw timestamp string,
            # not the YYYYMMDD format the info dict contract asks for — confirm
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2829
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set permalink via the Soundcloud API and return one
        info dict per track in the set."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # use report_error for consistency with SoundcloudIE
            # (trouble() is the deprecated spelling)
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # uploader and the slug of the set title are both in the url
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # resolve the permalink into the set's API resource (JSON)
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # the API reports resolution failures inside the JSON body
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # fetch the per-track stream definitions
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2910
2911
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream url, title and description from an
        InfoQ presentation page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in the jsclassref attribute
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # rpartition splits on the LAST dot, so filenames that contain
        # extra dots no longer raise ValueError as split('.') did
        video_id, _, extension = video_filename.rpartition('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2965
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (regex groups are already text on both py2 and py3 — the old
        # .decode('utf-8') calls crashed on python 3 str objects)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (urlopen returns bytes; decode before loading)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None
        format_param = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # every candidate url was dead; fail cleanly instead of
            # crashing on None below
            self._downloader.report_error(u'unable to extract a working file URL')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': format_param if format_param is not None else u'NA',
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3080
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video page yields one info dict,
        a course page or the site root recurses into the linked pages via
        self.extract and concatenates their results."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # per-video metadata lives in an XML file next to the media
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            # derive the extension from the media url's last dot
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # course title is the page's first <h1>; fall back to the id
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # collect the video-page links (deduplicated, order preserved)
            # and recurse into each one
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # collect every course-page link and recurse into each one
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3192
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the mtv_* meta tags, fetch the mediaGen XML and return
        the highest-quality rendition as a single info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns decoded text, so the old
        # .decode('iso-8859-1') calls were wrong (and crash on python 3)
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # the mediaGen endpoint returns an XML playlist of renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error for consistency with the rest of this class
            # (trouble() is the deprecated spelling)
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3272
3273
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv/mp4 downloads)."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a session id from the current time plus two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive a character permutation from the server-supplied seed.

        The seed drives a linear congruential generator that repeatedly
        picks (and removes) one character from the alphabet, producing a
        shuffled list used as a lookup table by _get_file_id.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated file id: each '*'-separated number is an
        index into the seed-derived permutation from _get_file_ID_mix_string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the getPlayList JSON, decode the segment file ids and
        return one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # map the user-requested format onto youku's stream names
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # one key per segment, needed to build each download url
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3383
3384
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv url, title and thumbnail out of the video page."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page source
        try:
            page_bytes = compat_urllib_request.urlopen(url).read()
            page = page_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # The flv url is percent-encoded inside the player parameters
        flv_match = re.search(self.VIDEO_URL_RE, page)
        if flv_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, page)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, page)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3447
3448
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            # Fixed: previously execution fell through after report_error and
            # crashed on mobj.group(1) with AttributeError.
            self._downloader.report_error(u'unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            # Fixed: previously execution fell through after report_error and
            # links[-1] below raised IndexError on the empty list.
            self._downloader.report_error(u'unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3572
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The direct CDN URL is derived from the page URL path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was misspelled 'uploader_date'; the documented
            # optional field name is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3608
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_dicts).

        Fixed: the error paths used to return None, which made the caller's
        tuple unpacking (`page_count, page_info = ...`) raise TypeError.
        They now return (0, []) so pagination stops cleanly after the error
        has been reported.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API responds with an object carrying 'error'.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip the dashes for YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Only the channel name matched: page through its whole archive.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3695
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Fixed: previously execution fell through after report_error and
            # crashed on m.group('url') with AttributeError.
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> when the player heading is absent.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # Fixed: the deprecated trouble() call did not abort, so
                # m.group('title') below crashed; report and return instead.
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3734
class SteamIE(InfoExtractor):
    """Information extractor for Steam store game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always extract from the game's video overview page, whatever URL
        # form matched.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # URLs, titles and thumbnails appear in the same order on the page,
        # so iterate them in lockstep.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Fixed: previously execution fell through after report_error
                # and an unusable entry with an empty url was still appended.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return videos
3775
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The recording id comes straight from the URL; the flv lives on a
        # predictable CDN path derived from it.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id

        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3797
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Pick the container extension from the matched URL.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # Fixed: replaced the deprecated trouble() call (which required a
            # manual 'ERROR: ' prefix) with report_error, consistent with the
            # rest of this file.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3853
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON blob in an inline script.
        match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3888
3889
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: this previously tested the stale 'result' variable left
            # over from the download-list regex above, so an unavailable
            # requested format was never reported and [None] was returned.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
4006
4007
4008
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the id and a human-readable title are part of the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed error message: this failure is about the upload date, not
            # the title (which comes from the URL above).
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4050
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The watch page only carries the title and a link to the embed
        # page, which in turn holds the actual media URL.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file location via addVariable().
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4096
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment in the page script.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id lets us walk the mix through the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_index = 0
        while True:
            track_index += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track = api_data[u'set']['track']
            entries.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return entries
4140
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs follow fixed CDN patterns based on the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_match.group('uploader')),
        }]
4164
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): '[.\s]*?' matches only literal dots and whitespace
        # between the attributes — presumably '[\S\s]*?' (any char) was
        # intended; confirm against the live playlist markup.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Talk entries and talk titles appear in the same order on the page,
        # so the two match iterators are consumed in lockstep.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Each talk is extracted via its own page, like a single-talk URL.
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails script blob carries the numeric id and the media
        # slug used to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4237
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de videos, driven by the site's XML metadata endpoint."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: was `format = ext`, a NameError -- the local holding the
            # file extension is called `extension`.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4293
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos: scrapes the title from the HTML page,
    then reads stream details from the per-video XML descriptor."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last entry in the descriptor is the variant this extractor uses.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4326
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Use report_error for consistency with the rest of this extractor
            # (trouble is the deprecated spelling).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUGFIX: previously reported the problem but fell through and
            # crashed with AttributeError on m.group(); fail cleanly instead.
            self._downloader.report_error(u'unable to find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4375
class ARDIE(InfoExtractor):
    """Extractor for the ARD Mediathek (ardmediathek.de / mediathek.daserste.de)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present;
        # otherwise fall back to the last path component from _VALID_URL.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams at all is expected only for age-restricted ("fsk")
            # content, which ARD only serves after 8 pm.
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # RTMP: the rtmp_url is the server URL, video_url the play path.
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            # Plain HTTP download: video_url is the direct mp4 link.
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4415
4416
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Instantiate each class in declaration order; more specific extractors
    # come first, GenericIE is the catch-all at the end.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]