Merge branch 'master' into extract_info_rewrite
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; instances shadow _ready/_downloader in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the IE name from the class name by dropping the "IE" suffix,
        # e.g. YoutubeIE -> "Youtube".  Subclasses may override with a constant.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        Reports progress via the downloader and wraps network failures in
        an ExtractorError carrying the current traceback.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download url_or_request and return the page body as a unicode string.

        The charset is taken from the Content-Type header when present,
        falling back to UTF-8; undecodable bytes are replaced.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # BUGFIX: the previous pattern captured everything after "charset="
        # (greedy ".+"), so a header such as
        # "text/html; charset=utf-8; someparam=x" yielded the bogus encoding
        # name "utf-8; someparam=x" and made .decode() raise LookupError.
        # Capture only the charset token and allow optional RFC 2616 quoting.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset="?([^"\s;]+)"?', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        return webpage_bytes.decode(encoding, 'replace')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
159
160
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: group 1 matches the whole optional URL prefix, group 2
    # is the video ID (see _extract_id, which reads group(2)).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; itags absent here default to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display size string used by _print_formats and the 'format' field.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping lang_code -> track name, or a
        (error_message, None) tuple on failure.  Callers distinguish the
        two cases with isinstance(..., tuple)."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # The listing is XML; tracks are scraped with a regex rather than parsed.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        # NOTE(review): if _get_available_subtitles returned an error tuple,
        # this still passes it to the report method — confirm intended.
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit --sub-lang, then English, then first listed.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print each itag with its extension and dimensions (for --list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX / dsh form tokens out of the login page.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        # NOTE(review): unlike the login POST above, this urlencode result is
        # not .encode('ascii')-ed; on Python 3 Request data must be bytes — confirm.
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from url, or None (after reporting) if invalid."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Group 2 of _VALID_URL is the 11-ish character video ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo the JSON-style backslash escaping of the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants — some videos only return
        # a usable 'token' for particular ones.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to single spaces before date parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except is overly broad (it even catches
                    # KeyboardInterrupt); it exists to skip non-matching formats.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every entry also carries a 'sig' field;
            # a missing 'sig' raises KeyError here — confirm upstream format.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
662
663
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1: first path component after /watch/ (video id); group 2: slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Page fetched to obtain the family-filter disclaimer (see _real_initialize).
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint POSTed to confirm age / disable the family filter.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
671
672     def __init__(self, downloader=None):
673         InfoExtractor.__init__(self, downloader)
674
675     def report_disclaimer(self):
676         """Report disclaimer retrieval."""
677         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
678
679     def report_age_confirmation(self):
680         """Report attempt to confirm age."""
681         self._downloader.to_screen(u'[metacafe] Confirming age')
682
683     def report_download_webpage(self, video_id):
684         """Report webpage download."""
685         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
686
687     def report_extraction(self, video_id):
688         """Report information extraction."""
689         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
690
691     def _real_initialize(self):
692         # Retrieve disclaimer
693         request = compat_urllib_request.Request(self._DISCLAIMER)
694         try:
695             self.report_disclaimer()
696             disclaimer = compat_urllib_request.urlopen(request).read()
697         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
698             self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
699             return
700
701         # Confirm age
702         disclaimer_form = {
703             'filters': '0',
704             'submit': "Continue - I'm over 18",
705             }
706         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
707         try:
708             self.report_age_confirmation()
709             disclaimer = compat_urllib_request.urlopen(request).read()
710         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
711             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
712             return
713
714     def _real_extract(self, url):
715         # Extract id and simplified title from URL
716         mobj = re.match(self._VALID_URL, url)
717         if mobj is None:
718             self._downloader.report_error(u'invalid URL: %s' % url)
719             return
720
721         video_id = mobj.group(1)
722
723         # Check if video comes from YouTube
724         mobj2 = re.match(r'^yt-(.*)$', video_id)
725         if mobj2 is not None:
726             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
727             return
728
729         # Retrieve video webpage to extract further information
730         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
731         try:
732             self.report_download_webpage(video_id)
733             webpage = compat_urllib_request.urlopen(request).read()
734         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
735             self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
736             return
737
738         # Extract URL, uploader and title from webpage
739         self.report_extraction(video_id)
740         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
741         if mobj is not None:
742             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
743             video_extension = mediaURL[-3:]
744
745             # Extract gdaKey if available
746             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
747             if mobj is None:
748                 video_url = mediaURL
749             else:
750                 gdaKey = mobj.group(1)
751                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
752         else:
753             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
754             if mobj is None:
755                 self._downloader.report_error(u'unable to extract media URL')
756                 return
757             vardict = compat_parse_qs(mobj.group(1))
758             if 'mediaData' not in vardict:
759                 self._downloader.report_error(u'unable to extract media URL')
760                 return
761             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
762             if mobj is None:
763                 self._downloader.report_error(u'unable to extract media URL')
764                 return
765             mediaURL = mobj.group(1).replace('\\/', '/')
766             video_extension = mediaURL[-3:]
767             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
768
769         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
770         if mobj is None:
771             self._downloader.report_error(u'unable to extract title')
772             return
773         video_title = mobj.group(1).decode('utf-8')
774
775         mobj = re.search(r'submitter=(.*?);', webpage)
776         if mobj is None:
777             self._downloader.report_error(u'unable to extract uploader nickname')
778             return
779         video_uploader = mobj.group(1)
780
781         return [{
782             'id':       video_id.decode('utf-8'),
783             'url':      video_url.decode('utf-8'),
784             'uploader': video_uploader.decode('utf-8'),
785             'upload_date':  None,
786             'title':    video_title,
787             'ext':      video_extension.decode('utf-8'),
788         }]
789
790
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL and metadata from a Dailymotion watch page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The matched path segment may carry a slug / query string; keep only the id.
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter switched off.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        page = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', page)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Probe qualities from best to worst and keep the first available key.
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', page)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', page)
        if m is not None:
            video_uploader = m.group(1)
        else:
            # Fall back to the official-user markup.
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', page)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', page)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
878
879
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Pull the flv URL, title and uploader out of a photobucket page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # The id is the flv filename taken from the 'current' query parameter.
        video_id = m.group(1)
        video_extension = 'flv'

        # Download the page that embeds the player.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        m = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', page)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title and uploader come from the same <title> element.
        m = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', page)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = m.group(1).decode('utf-8')
        video_uploader = m.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
943
944
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a video.yahoo.com page.

        Non-'/watch/' URLs are first resolved to the canonical /watch/ form
        and re-extracted once (new_video=False marks the recursive call).
        Returns a single-element list of info dictionaries, or None after
        reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # Fixed: group(1) is the literal 'people'/'profile' path segment;
        # the uploader name is the anchor text, which is group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; both are required by the
        # playlist request below.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1086
1087
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs, including player.vimeo.com and
    # play_redirect_hls forms; the id is always the trailing digits.
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a vimeo.com page.

        Returns a single-element list of info dictionaries, or None after
        reporting an error.  (new_video is currently unused here.)
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize scheme-less and HLS-redirect URLs to a canonical
        # https form before downloading.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON: the page embeds it between the literal
        # markers ' = {config:' and ',assets:' in the player setup script.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            # NOTE(review): bare except maps *any* failure (including
            # programming errors) to this message -- consider narrowing
            # to (IndexError, ValueError).
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id (last path segment of the owner URL)
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description from the page markup (not the config JSON)
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date, normalized to YYYYMMDD
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality; the for/else reports an error
        # only when no known codec was found at any quality.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        # The play_redirect endpoint resolves to the actual media URL.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1206
1207
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages are recognised by their trailing index-<n>.html segment.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urlopen raises ValueError for malformed URLs.
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and return selected groups as a dict.

        matchTuples is a list of (group_index, key, error_message) tuples;
        each matched group is stored under its key.  Returns None after
        reporting an error if the page or any required group is missing.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): fetch_webpage returns None on download failure,
        # which would make re.search raise TypeError here -- confirm the
        # intended error path.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): trouble() is used here while the rest of
                # the file calls report_error() -- possibly a leftover.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp address of a live-stream page.

        NOTE(review): the computed video_url is never returned, so callers
        always receive None -- live extraction appears to be a dead code
        path as written; confirm before relying on it.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of indirections for an arte+7 page and return
        the info dictionary for the hd-quality stream."""
        video_lang = url.split('/')[-3]
        # Step 1: the player param points at a videoref file URL.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the language-specific <video> reference.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: extract id, title, date and the hd stream URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode('utf-8') assumes a Python 2 str value;
            # this would fail under Python 3 -- confirm target versions.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between the live-stream and arte+7 extraction paths."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # NOTE(review): returns None here, so live URLs produce no
            # downloadable result (see extractLiveStream above).
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1342
1343
1344 class GenericIE(InfoExtractor):
1345     """Generic last-resort information extractor."""
1346
1347     _VALID_URL = r'.*'
1348     IE_NAME = u'generic'
1349
    def __init__(self, downloader=None):
        # No extractor-specific state; plain delegation to the base class.
        InfoExtractor.__init__(self, downloader)
1352
1353     def report_download_webpage(self, video_id):
1354         """Report webpage download."""
1355         if not self._downloader.params.get('test', False):
1356             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1357         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1358
1359     def report_extraction(self, video_id):
1360         """Report information extraction."""
1361         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1362
    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed to new_url."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1366
1367     def _test_redirect(self, url):
1368         """Check if it is a redirect, like url shorteners, in case return the new url."""
1369         class HeadRequest(compat_urllib_request.Request):
1370             def get_method(self):
1371                 return "HEAD"
1372
1373         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1374             """
1375             Subclass the HTTPRedirectHandler to make it use our
1376             HeadRequest also on the redirected URL
1377             """
1378             def redirect_request(self, req, fp, code, msg, headers, newurl):
1379                 if code in (301, 302, 303, 307):
1380                     newurl = newurl.replace(' ', '%20')
1381                     newheaders = dict((k,v) for k,v in req.headers.items()
1382                                       if k.lower() not in ("content-length", "content-type"))
1383                     return HeadRequest(newurl,
1384                                        headers=newheaders,
1385                                        origin_req_host=req.get_origin_req_host(),
1386                                        unverifiable=True)
1387                 else:
1388                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1389
1390         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1391             """
1392             Fallback to GET if HEAD is not allowed (405 HTTP error)
1393             """
1394             def http_error_405(self, req, fp, code, msg, headers):
1395                 fp.read()
1396                 fp.close()
1397
1398                 newheaders = dict((k,v) for k,v in req.headers.items()
1399                                   if k.lower() not in ("content-length", "content-type"))
1400                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1401                                                  headers=newheaders,
1402                                                  origin_req_host=req.get_origin_req_host(),
1403                                                  unverifiable=True))
1404
1405         # Build our opener
1406         opener = compat_urllib_request.OpenerDirector()
1407         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1408                         HTTPMethodFallback, HEADRedirectHandler,
1409                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1410             opener.add_handler(handler())
1411
1412         response = opener.open(HeadRequest(url))
1413         new_url = response.geturl()
1414
1415         if url == new_url:
1416             return False
1417
1418         self.report_following_redirect(new_url)
1419         return new_url
1420
1421     def _real_extract(self, url):
1422         new_url = self._test_redirect(url)
1423         if new_url: return [self.url_result(new_url)]
1424
1425         video_id = url.split('/')[-1]
1426         try:
1427             webpage = self._download_webpage(url, video_id)
1428         except ValueError as err:
1429             # since this is the last-resort InfoExtractor, if
1430             # this error is thrown, it'll be thrown here
1431             self._downloader.report_error(u'Invalid URL: %s' % url)
1432             return
1433
1434         self.report_extraction(video_id)
1435         # Start with something easy: JW Player in SWFObject
1436         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1437         if mobj is None:
1438             # Broaden the search a little bit
1439             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1440         if mobj is None:
1441             # Broaden the search a little bit: JWPlayer JS loader
1442             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1443         if mobj is None:
1444             self._downloader.report_error(u'Invalid URL: %s' % url)
1445             return
1446
1447         # It's possible that one of the regexes
1448         # matched, but returned an empty group:
1449         if mobj.group(1) is None:
1450             self._downloader.report_error(u'Invalid URL: %s' % url)
1451             return
1452
1453         video_url = compat_urllib_parse.unquote(mobj.group(1))
1454         video_id = os.path.basename(video_url)
1455
1456         # here's a fun little line of code for you:
1457         video_extension = os.path.splitext(video_id)[1][1:]
1458         video_id = os.path.splitext(video_id)[0]
1459
1460         # it's tempting to parse this further, but you would
1461         # have to take into account all the variations like
1462         #   Video Title - Site Name
1463         #   Site Name | Video Title
1464         #   Video Title - Tagline | Site Name
1465         # and so on and so forth; it's just not practical
1466         mobj = re.search(r'<title>(.*)</title>', webpage)
1467         if mobj is None:
1468             self._downloader.report_error(u'unable to extract title')
1469             return
1470         video_title = mobj.group(1)
1471
1472         # video uploader is domain name
1473         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1474         if mobj is None:
1475             self._downloader.report_error(u'unable to extract title')
1476             return
1477         video_uploader = mobj.group(1)
1478
1479         return [{
1480             'id':       video_id,
1481             'url':      video_url,
1482             'uploader': video_uploader,
1483             'upload_date':  None,
1484             'title':    video_title,
1485             'ext':      video_extension,
1486         }]
1487
1488
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ytsearch:Q (first result), ytsearchN:Q (first N results) and
    ytsearchall:Q (up to _max_youtube_results), using the GData JSON-C API.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            # Keep the try body minimal: previously the downstream
            # _download_n_results call sat inside it, so a ValueError raised
            # there was swallowed and the search silently restarted with n=1.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have
        # enough ids or the reported total is exhausted.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                # Consistent with the rest of the file: report_error instead
                # of the deprecated trouble() helper.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1567
1568
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles gvsearch:Q, gvsearchN:Q and gvsearchall:Q by scraping the
    Google Video search result pages.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Keep the try body minimal: previously the downstream
            # _download_n_results call sat inside it, so a ValueError raised
            # there was swallowed and the search silently restarted with n=1.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen identifiers from this result page
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        break

            # Queue everything once we have n ids or there is no next page
            # (single download loop replaces the two duplicated ones).
            if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1649
1650
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles yvsearch:Q, yvsearchN:Q and yvsearchall:Q by scraping the
    Yahoo! Video search result pages. Currently marked not working.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Keep the try body minimal: previously the downstream
            # _download_n_results call sat inside it, so a ValueError raised
            # there was swallowed and the search silently restarted with n=1.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicating across pages
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        break

            # Queue everything once we have n ids or there is no next page
            # (single download loop replaces the two duplicated ones).
            if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1735
1736
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the default matcher
        # (which applies no flags) cannot be reused here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Resolve a playlist URL into an ordered playlist result."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Either capture group may carry the playlist id
        playlist_id = mobj.group(1) or mobj.group(2)
        videos = []

        # Page through the GData feed, _MAX_RESULTS entries at a time
        for page_num in itertools.count(1):
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response or 'entry' not in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so the final order follows the playlist
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means the feed is exhausted
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break

        videos = [v for (position, v) in sorted(videos)]
        total = len(videos)

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # playlistend == -1 means "until the end of the playlist"
        stop = None if playlistend == -1 else playlistend
        videos = videos[playliststart:stop]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        url_results = [self.url_result(video_url) for video_url in videos]
        return [self.playlist_result(url_results, playlist_id)]
1826
1827
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk the channel's paginated video list and return it as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []

        # Fetch listing pages until the "next page" marker disappears
        for pagenum in itertools.count(1):
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's ids, skipping duplicates within the page
            seen = set()
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in seen:
                    seen.add(candidate)
                    video_ids.append(candidate)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % vid) for vid in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1878
1879
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect a user's upload ids via the GData API and return a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request consecutive pages until one comes back short — a short
        # page means there are no further ids to fetch.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Per-page extraction, de-duplicating within the page only;
            # the page's (deduped) size also drives the termination test.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in ids_in_page:
                    ids_in_page.append(match.group(1))

            video_ids.extend(ids_in_page)

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # playlistend == -1 means "until the end of the uploads list"
        stop = None if playlistend == -1 else playlistend
        video_ids = video_ids[playliststart:stop]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id)
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1962
1963
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all of a blip.tv user's video ids and return a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # The numeric users_id is embedded in the profile page; the
        # episode-list API cannot be queried without it. Fixed: previously
        # a missing match raised an uncaught AttributeError on .group().
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract users_id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fixed: use compat_str (not str) like the rest of the file,
                # so the message survives non-ASCII errors on Python 2.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Fixed: unescape before the membership test, so the
                # de-duplication compares the same form that gets stored.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2054
2055
2056 class DepositFilesIE(InfoExtractor):
2057     """Information extractor for depositfiles.com"""
2058
2059     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2060
    def report_download_webpage(self, file_id):
        """Report webpage download (status output only, no side effects on state)."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2064
    def report_extraction(self, file_id):
        """Report information extraction (status output only, no side effects on state)."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2068
2069     def _real_extract(self, url):
2070         file_id = url.split('/')[-1]
2071         # Rebuild url in english locale
2072         url = 'http://depositfiles.com/en/files/' + file_id
2073
2074         # Retrieve file webpage with 'Free download' button pressed
2075         free_download_indication = { 'gateway_result' : '1' }
2076         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2077         try:
2078             self.report_download_webpage(file_id)
2079             webpage = compat_urllib_request.urlopen(request).read()
2080         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2081             self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2082             return
2083
2084         # Search for the real file URL
2085         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2086         if (mobj is None) or (mobj.group(1) is None):
2087             # Try to figure out reason of the error.
2088             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2089             if (mobj is not None) and (mobj.group(1) is not None):
2090                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2091                 self._downloader.report_error(u'%s' % restriction_message)
2092             else:
2093                 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2094             return
2095
2096         file_url = mobj.group(1)
2097         file_extension = os.path.splitext(file_url)[1][1:]
2098
2099         # Search for file title
2100         mobj = re.search(r'<b title="(.*?)">', webpage)
2101         if mobj is None:
2102             self._downloader.report_error(u'unable to extract title')
2103             return
2104         file_title = mobj.group(1).decode('utf-8')
2105
2106         return [{
2107             'id':       file_id.decode('utf-8'),
2108             'url':      file_url.decode('utf-8'),
2109             'uploader': None,
2110             'upload_date':  None,
2111             'title':    file_title,
2112             'ext':      file_extension.decode('utf-8'),
2113         }]
2114
2115
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Credentials come either from the --username/--password options or
        from the 'facebook' machine entry in ~/.netrc. Without credentials
        this is a no-op.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # BUGFIX: decode the response so the regex below matches text
            # rather than bytes (urlopen().read() returns bytes).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # A login form in the response means we are still logged out.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # BUGFIX: "exceded" -> "exceeded" in the warning message.
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract id, title, URL and duration from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf parameters are embedded in an inline script; grab the JSON
        # array sitting between these two fixed code snippets.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2211
2212
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL itself turned out to be the media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info from blip.tv's JSON API, or directly from the
        HTTP response when the server answers with the media file itself."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # "/play/" URLs redirect to a player page whose URL fragment carries
        # the real file reference; rebuild the canonical "/a/a-<id>" URL and
        # recurse once on that.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask for the JSON representation of the page (no HTML wrapper).
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The same User-Agent is handed to the downloader below; presumably
        # blip.tv serves different data to other agents -- confirm.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL itself is the media file: derive id/title/ext from
                # the URL's basename and hand over the open handle so the
                # downloader does not have to reopen the connection.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode only exists on Python 2; on
                # Python 3 this branch would raise AttributeError -- confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # Read the body of the response opened above (only its
                # headers have been inspected so far).
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the record in a "Post" key.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # 'datestamp' is e.g. "12-31-12 11:05AM".
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2313
2314
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was "self._download.report_error", which raised an
            # AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media server base URL; the video
        # itself is "<base>/<id>.flv" on the same server.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2363
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates the site is known to serve, with their container format and
    # dimensions (the latter two are used by _print_formats below).
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the mediaGen configuration for one part."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print each format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract every part of an episode or clip.

        Steps: expand shortname URLs and follow the "newest episode"
        redirect, find the mtvnservices media URI in the page, download the
        MRSS index listing the episode's parts, then fetch each part's
        mediaGen config to pick a bitrate and build the final video URL.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # ":tds"-style shortcuts expand to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the server redirects it to a specific one (handled below).
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn which episode we got.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media reference (data-mgid)
            # without a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The <guid> looks like "...:<show>.com:...:<shortMediaId>".
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # The mediaGen config lists one <rendition> per bitrate.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The feed only carries rtmp:// URLs; rewrite them onto the
            # HTTP mirror, keeping the "gsp.comedystor/..." path.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2558
2559
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from an escapistmagazine.com episode page.

        The page's OpenGraph tags point at a Flash player whose ``config=``
        query argument is a JavaScript-flavoured JSON playlist; the actual
        video URL is the second playlist entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset advertised in the Content-Type header.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Each of the following tags is required. BUGFIX: previously a
        # missing tag caused an AttributeError on .group(None); report a
        # clear error instead, matching the rest of this extractor.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.report_error(u'unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.report_error(u'unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: it uses single quotes.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2633
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Extractor is currently flagged as not working.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the Adobe f4m manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build an f4f fragment URL from the site's "moogaloop" metadata
        and the Adobe HDS (f4m) manifest it points at."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Per-video metadata (title, description, manifest URL) lives in a
        # small XML document served by the "moogaloop" endpoint.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore query argument added to the manifest request -- presumably
        # required by the CDN; confirm.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest is an Adobe f4m document; note the XML namespace on
        # every element lookup. The media node's 'url' attribute is the
        # stream node id, and <id> overrides the page's video id.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Assemble the URL of the stream's first fragment on the same host
        # that served the manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2704
2705
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of a watch page."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flv URL sits percent-encoded in a query fragment of the page.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title comes from the <title> tag, minus the trailing site name.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # For the thumbnail the entire matched URL is used (group 0),
        # not the capture group.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2763
2764
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       To access the media, the uid of the song and a stream token must be
       extracted from the page source and the script must make a request to
       media.soundcloud.com/crossdomain.xml. Then the media can be grabbed
       by requesting from an url composed of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definition is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the slug of the track title are in the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the pretty URL into the track's JSON metadata
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Ask the CDN for the stream definitions of this track
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2837
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set URL and return one info dict per track in the set."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error replaces the deprecated trouble(u'ERROR: ...')
            # calls formerly used here, for consistency with SoundcloudIE
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The resolve API reports failures as an 'errors' list
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2918
2919
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real stream id is base64-encoded in the page source
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Description is optional; fall back to a fixed placeholder
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = mobj.group(1) if mobj is not None else u'No description available.'

        # The filename at the end of the RTMP URL doubles as id and extension
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2973
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If the requested bitrate is missing or 'best', the highest available
        bitrate is selected. Formats without bitrate info map directly to a
        url list (hence the TypeError fallback).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # try the next candidate

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url; re groups are already text,
        # so no .decode() is needed (the old calls crashed on Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON; the HTTP body is bytes, so decode before json.loads
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # check_urls returns None when no candidate URL responded
        if file_url is None:
            self._downloader.report_error(u'unable to find an active media url')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3088
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # The URL names either a specific video (course + video groups), a course
    # page (course only), or the site root; _real_extract dispatches on this.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, or recursively expand a course/root page.

        Course and root pages build 'reference' entries whose URLs are fed
        back through self.extract, so the returned list is fully flattened.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and file name come from the per-video metadata XML
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a reference entry, resolved below
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Each CoursePage link is expanded recursively via self.extract
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3200
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns a decoded text string, so the old
        # .decode('iso-8859-1') calls on the match groups were unnecessary
        # (and crashed on Python 3, where str has no decode method)
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error for consistency with the other error paths above
            # (this previously used the deprecated trouble() method)
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3280
3281
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in several segments; one info dict is returned per
    segment. Download URLs are derived from a server-supplied seed via a
    deterministic character-mixing scheme (_get_file_ID_mix_string).
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the source alphabet with the given seed.

        The same seed always yields the same character ordering, which is
        what makes the obfuscated file ids decodable in _get_file_id.
        Returns the shuffled characters as a list.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG-style step; the constants presumably mirror the site's
            # player code -- do not change them
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated '*'-separated file id via the seed-mixed alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # NOTE(review): 'worst' maps to mp4 and any other explicit format
            # request falls back to flv -- the requested format string itself
            # is not honoured here; confirm this is intentional
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3391
3392
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page directly
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # Video URL is percent-encoded in the flv_url page parameter
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3455
3456
3457 class GooglePlusIE(InfoExtractor):
3458     """Information extractor for plus.google.com."""
3459
3460     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3461     IE_NAME = u'plus.google'
3462
    def __init__(self, downloader=None):
        """Initialize the extractor with an optional downloader."""
        InfoExtractor.__init__(self, downloader)
3465
    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3469
    def report_date(self, upload_date):
        """Report the extracted upload date of the entry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3473
    def report_uploader(self, uploader):
        """Report the extracted uploader of the entry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3477
    def report_title(self, video_title):
        """Report the extracted title of the entry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3481
    def report_extract_vid_page(self, video_page):
        """Report extraction of the video page."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3485
3486     def _real_extract(self, url):
3487         # Extract id from URL
3488         mobj = re.match(self._VALID_URL, url)
3489         if mobj is None:
3490             self._downloader.report_error(u'Invalid URL: %s' % url)
3491             return
3492
3493         post_url = mobj.group(0)
3494         video_id = mobj.group(1)
3495
3496         video_extension = 'flv'
3497
3498         # Step 1, Retrieve post webpage to extract further information
3499         self.report_extract_entry(post_url)
3500         request = compat_urllib_request.Request(post_url)
3501         try:
3502             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3503         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3504             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3505             return
3506
3507         # Extract update date
3508         upload_date = None
3509         pattern = 'title="Timestamp">(.*?)</a>'
3510         mobj = re.search(pattern, webpage)
3511         if mobj:
3512             upload_date = mobj.group(1)
3513             # Convert timestring to a format suitable for filename
3514             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3515             upload_date = upload_date.strftime('%Y%m%d')
3516         self.report_date(upload_date)
3517
3518         # Extract uploader
3519         uploader = None
3520         pattern = r'rel\="author".*?>(.*?)</a>'
3521         mobj = re.search(pattern, webpage)
3522         if mobj:
3523             uploader = mobj.group(1)
3524         self.report_uploader(uploader)
3525
3526         # Extract title
3527         # Get the first line for title
3528         video_title = u'NA'
3529         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3530         mobj = re.search(pattern, webpage)
3531         if mobj:
3532             video_title = mobj.group(1)
3533         self.report_title(video_title)
3534
3535         # Step 2, Stimulate clicking the image box to launch video
3536         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3537         mobj = re.search(pattern, webpage)
3538         if mobj is None:
3539             self._downloader.report_error(u'unable to extract video page URL')
3540
3541         video_page = mobj.group(1)
3542         request = compat_urllib_request.Request(video_page)
3543         try:
3544             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3545         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3546             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3547             return
3548         self.report_extract_vid_page(video_page)
3549
3550
3551         # Extract video links on video page
3552         """Extract video links of all sizes"""
3553         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3554         mobj = re.findall(pattern, webpage)
3555         if len(mobj) == 0:
3556             self._downloader.report_error(u'unable to extract video links')
3557
3558         # Sort in resolution
3559         links = sorted(mobj)
3560
3561         # Choose the lowest of the sort, i.e. highest resolution
3562         video_url = links[-1]
3563         # Only get the url. The resolution part in the tuple has no use anymore
3564         video_url = video_url[-1]
3565         # Treat escaped \u0026 style hex
3566         try:
3567             video_url = video_url.decode("unicode_escape")
3568         except AttributeError: # Python 3
3569             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3570
3571
3572         return [{
3573             'id':       video_id,
3574             'url':      video_url,
3575             'uploader': uploader,
3576             'upload_date':  upload_date,
3577             'title':    video_title,
3578             'ext':      video_extension,
3579         }]
3580
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The media URL follows directly from the page path on the CDN
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date', which the downloader
            # ignores; the documented field name is 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3616
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the JSON API and convert it to info dicts.

        Always returns a (count, items) tuple; on error the page is reported
        and treated as empty."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # BUG FIX: returning None here crashed the tuple unpacking
            # 'page_count, page_info = self._parse_page(...)' in _real_extract
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # BUG FIX: same as above — keep the (count, items) contract
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Channel URLs (only one group matched) are paginated archives;
        # /b/ URLs address a single broadcast
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3703
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # BUG FIX: previously fell through and crashed on m.group('url')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # was self._downloader.trouble (deprecated) with no return,
            # which crashed on m.group('title') below
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        # Description is optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3740
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always extract from the game's video page, whatever URL we were given
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators walk the page in document order, so zip keeps
        # each movie entry aligned with its title and thumbnail
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                # BUG FIX: previously the entry was appended anyway with an
                # empty url; skip it instead
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return videos
3781
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The recorded-video id comes straight from the URL
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The media URL on the CDN is derived from the id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id

        webpage = self._download_webpage(url, video_id)
        # Title and uploader channel id are embedded as data-* attributes
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage).group('uploader')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3803
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Container is inferred from the media URL itself
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # was self._downloader.trouble(u'ERROR: ...'): trouble is deprecated
            # and report_error already adds its own 'ERROR:' prefix
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fallback title; 'World Start' was a typo for 'World Star'
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3859
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON blob in an inline script
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the Akamai edge
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3894
3895
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats matching req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component 4 encodes "<size>_<bitrate>_<id>"
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: previously tested the stale 'result' match object, so an
            # unavailable format was silently returned as [None]
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
4012
4013
4014
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both id and title are present in the URL itself
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: message previously said 'video title' (copy/paste error)
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4056
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the page the user linked to
        webpage = self._download_webpage(url, video_id)

        # The title lives in the <title> tag
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Locate the embed page, which carries the numeric video id
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the stream URL via a JS variable
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4102
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment in the page
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The playback API needs a session token and walks the mix one
        # track at a time until it reports the last track
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        for index in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4146
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs are derived directly from the video id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4170
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk or a whole playlist
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        # Both iterators walk the page in document order, so zip keeps each
        # talk entry aligned with its title link; each talk page is then
        # extracted individually
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # The direct download URL is derived from the media slug
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4243
class MySpassIE(InfoExtractor):
    """Information extractor for videos hosted on myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract video info for a myspass.de URL.

        The video id is the last (or second-to-last, when the URL has a
        trailing slash) path component; it is used to query the site's XML
        metadata endpoint, from which url, title, format, description and
        thumbnail are read.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: the original read the undefined name `ext` here,
            # which raised NameError; fall back to the file extension.
            format = extension
        else:
            format = format_id_el.text
        # Optional fields: absent elements simply yield None.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4299
class SpiegelIE(InfoExtractor):
    """Information extractor for videos on spiegel.de."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Scrape the video page, fetch the flash XML and build the info dict."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Per-video XML descriptor holds the actual media file name/duration.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child describes the best available variant.
        best_variant = idoc[-1]
        filename = best_variant.findall('./filename')[0].text
        duration = float(best_variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4332
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Scrape the LiveLeak view page for the media URL and metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: use report_error instead of the deprecated trouble()
            # (report_error supplies the 'ERROR:' prefix itself), matching
            # the style used further down in this method.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Fix: the original only logged here and then dereferenced the
            # failed match anyway, crashing with AttributeError; abort with
            # a clear error instead (same style as SpiegelIE).
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional; missing values become None.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4381
4382
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # One entry per supported site, tried in this exact order; GenericIE
    # must stay last as the catch-all.
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        GenericIE,
    ]
    return [ie_class() for ie_class in ie_classes]