Add a playlist_index key to the info_dict, which can be used in the output template
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False        # True once _real_initialize() has been run
    _downloader = None    # FileDownloader instance, set via set_downloader()
    _WORKING = True       # set to False in subclasses that are known broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default name: the class name with the trailing 'IE' stripped.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle.

        Raises ExtractorError (with the original traceback attached) on
        network/HTTP failure. Note: requires the module-level `import sys`.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string.

        The charset is taken from the Content-Type header when present;
        otherwise UTF-8 is assumed. Undecodable bytes are replaced rather
        than raising.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        return webpage_bytes.decode(encoding, 'replace')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Tag *video_info* as a single video result and return it."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Build a playlist result from *entries*; id/title are optional."""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
159
160
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; formats not listed here fall back to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string shown by --list-formats ('???' when unknown)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL, so hand them off first.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitle tracks."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name, or an
        (error_message, None) tuple when the list cannot be fetched
        or the video has no subtitles."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages to the screen."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]

        The language is chosen as: --sub-lang if given, else 'en' if
        available, else the first listed language.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples, one per language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print itag, extension and dimensions for each format in *formats*."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, when credentials are available
        (via options or .netrc), log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX and dsh tokens the login form requires
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form reappearing in the response means the login failed
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            # NOTE(review): the response body is read but never inspected
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID (group 2 of _VALID_URL), or None after
        reporting an error when the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download the watch page and get_video_info data for *url* and
        return a list of info dictionaries, one per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the backslash-escaped URL found in the page's JS
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try each 'el' variant until a response contains a 'token'
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to single spaces before date parsing
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # try the next date format on parse failure
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig' field;
            # an entry without it would raise KeyError here
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one info dict per (format, url) pair selected above
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
662
663
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Handles native Metacafe videos and re-hosted YouTube videos (ids of
    the form 'yt-<youtube id>'), which are delegated to the YouTube
    extractor via url_result().
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Disable the family filter for this session.

        Retrieves the disclaimer page and then POSTs the age-confirmation
        form; the resulting session state makes restricted videos
        accessible in later requests. Errors are reported, not raised.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the video info dict, or None
        after reporting an error."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: a direct mediaURL parameter, optionally
            # signed with a gdaKey token.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: media URL and key live inside the flashvars
            # form value, which holds query-string-encoded JSON.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
788
789
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the video info dict, or None
        after reporting an error."""
        # Validate the URL and reduce it to the bare video id
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = m.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the watch page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Locate the flashvars blob that carries the media URLs
        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best available quality, highest first
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then fall back to
        # the official-user markup; missing uploader is only a warning.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            # looking for official user
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY on the page; store as YYYYMMDD
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
877
878
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the video info dict, or None
        after reporting an error."""
        # The video id is the .flv filename in the 'current' query parameter
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = m.group(1)

        video_extension = 'flv'

        # Download the page that embeds the player
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The direct media URL lives in the video_src <link> element
        self.report_extraction(video_id)
        m = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title and uploader are both taken from the page <title>
        m = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = m.group(1).decode('utf-8')
        video_uploader = m.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
942
943
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the video info dict, or None
        after reporting an error.

        Non-'/watch/' URLs are first rewritten to the canonical English
        /watch/ form and re-extracted once with new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # group(1) is the '(people|profile)' path alternation; the actual
        # uploader nickname is captured by group(2). Using group(1) here
        # previously reported every uploader as 'people' or 'profile'.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1085
1086
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the video info dict, or None
        after reporting an error."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Rewrite play_redirect_hls links to the canonical video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player markup
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is missing from the page;
            # ValueError: the extracted blob is not valid JSON. Narrowed from
            # a bare 'except:', which also swallowed KeyboardInterrupt and
            # SystemExit.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer hd, then sd, then whatever quality the codec offers
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1205
1206
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw response body.

        On failure the error is reported via the downloader and None is
        returned implicitly; callers must be prepared for a None result.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and map regex match groups into a dict.

        matchTuples is a list of (group_index, key, error_message): each
        named group index is stored under its key; if a group is missing,
        the error message is emitted and None is returned.

        NOTE(review): if fetch_webpage() failed, page is None and
        re.search raises TypeError — confirm whether callers rely on that.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # Uses the legacy _downloader.trouble() API here, while the
                # rest of the class uses report_error() — kept as-is.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the videothek JS to locate the live-stream RTMP URL.

        NOTE(review): the computed video_url below is never returned or
        stored, so live streams currently yield no result (see
        _real_extract, which also discards this method's return value).
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # Dead store: the result is discarded (see NOTE in the docstring).
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract an Arte+7 (catch-up) video.

        Follows the videoref XML chain (player page -> videoref file ->
        per-language <video> document) and returns the info dict for the
        hd-quality stream.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extractor based on the URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams: extractLiveStream() produces no info dict, so
            # extraction ends here without a result.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1341
1342
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tried when no specialized extractor claims the URL: it follows
    URL-shortener redirects and otherwise scrapes the page for an embedded
    video URL (JW Player / SWFObject style flashvars).
    """

    # Matches anything; this IE must be tried last.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # The warning is suppressed in test mode, where the fallback is expected.
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and returns the
        final URL, or False when the URL does not redirect anywhere.
        """
        # Request subclass that forces the HEAD method.
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue as a plain (GET) request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE: a bare OpenerDirector is used (not build_opener) so that only
        # these handlers participate; their registration order matters.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following the chain means there was no redirect.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        # Redirects (URL shorteners etc.) are delegated back to the
        # extractor machinery via a url_result.
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1486
1487
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles the pseudo-URLs ytsearch:Q (first result), ytsearchall:Q
    (up to _max_youtube_results) and ytsearchN:Q (first N results),
    querying the GData JSON-C API and queueing each hit for download.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging while more
        # results are both wanted and available.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # Fixed: was self._downloader.trouble(...), a deprecated method
            # inconsistent with report_error used everywhere else here.
            if 'items' not in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The service may report fewer total hits than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1566
1567
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and trigger the result downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # No prefix -> single result; 'all' -> service maximum; otherwise
        # an explicit (clamped) count.
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            try:
                count = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                count = self._max_google_results
            self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Pull video identifiers off this page, first occurrence only.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in collected:
                    continue
                collected.append(video_id)
                if len(collected) == n:
                    # Requested count reached; stop crawling.
                    self._queue_downloads(collected)
                    return

            # No "next page" marker means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._queue_downloads(collected)
                return

            pagenum = pagenum + 1

    def _queue_downloads(self, video_ids):
        # Hand every collected id to the downloader as a playback URL.
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
1648
1649
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and trigger the result downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # No prefix -> single result; 'all' -> service maximum; otherwise
        # an explicit (clamped) count.
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        else:
            try:
                count = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
                count = self._max_yahoo_results
            self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect unseen video identifiers from this results page.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in seen:
                    continue
                seen.add(video_id)
                collected.append(video_id)
                if len(collected) == n:
                    # Requested count reached; stop crawling.
                    self._queue_downloads(collected)
                    return

            # Missing "Next" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._queue_downloads(collected)
                return

            pagenum = pagenum + 1

    def _queue_downloads(self, video_ids):
        # Hand every collected id to the downloader as a watch URL.
        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
1734
1735
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the flag is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Either alternative of the pattern may have captured the id.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Walk the GData API page by page, collecting (position, url) pairs
        # so the playlist order can be restored afterwards.
        position_and_url = []
        for page_num in itertools.count(1):
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response or 'entry' not in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            feed_entries = response['feed']['entry']
            position_and_url.extend((entry['yt$position']['$t'], entry['content']['src'])
                                    for entry in feed_entries
                                    if 'content' in entry)

            # A short page is the last page.
            if len(feed_entries) < self._MAX_RESULTS:
                break

        # Restore playlist order, then keep only the video URLs.
        ordered_urls = [pair[1] for pair in sorted(position_and_url)]

        url_results = [self.url_result(video_url) for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id)]
1812
1813
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []

        # Fetch channel listing pages until the "Next" marker disappears.
        for pagenum in itertools.count(1):
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, first occurrence only.
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                video_id = mobj.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(watch_url) for watch_url in urls]
        return [self.playlist_result(url_entries, channel_id)]
1864
1865
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 ids), so keep
        # requesting consecutive windows until a non-full page arrives.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, first occurrence only.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page with fewer than _GDATA_PAGE_SIZE ids must be the last
            # one, so no further request is needed.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url) for watch_url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1936
1937
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect every video of a blip.tv user into a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is required by the episode-list endpoint.
        # Previously a missing match raised an uncaught AttributeError on
        # mobj.group(1); report a clear error instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers. Fixed: the duplicate check used to
            # compare the raw match against already-unescaped entries, so
            # ids containing HTML entities were never deduplicated.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        # Fixed: this line referenced the undefined name all_ids_count and
        # always raised NameError. Every collected id is downloaded, so
        # both counts are len(video_ids).
        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, len(video_ids), len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(video_url) for video_url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2019
2020
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Fixed: decode the response once, as the other IEs in this file
            # do. urlopen().read() returns bytes on Python 3, so matching the
            # str regexes below against it raised TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        # Fixed: dropped the .decode('utf-8') calls on these values - they
        # are already text (str) once the page is decoded above, and str has
        # no .decode() on Python 3.
        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2079
2080
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook videos.

    Supports optional login via --username/--password or .netrc so that
    videos restricted to logged-in users can be fetched.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        # Perform the (optional) login once, before any extraction happens.
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Login is best-effort: warn and continue anonymously.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means we are still on the
            # login page, i.e. authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the (HD when available) mp4 URL, title, duration and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are a JSON array wedged between these two
        # exact JavaScript snippets in the page source.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is itself URL-quoted JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD source; fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2176
2177
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles three URL shapes: /play/ URLs (resolved via redirect to the
    canonical page), direct media URLs (Content-Type video/*), and regular
    page URLs queried through blip.tv's JSON API.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player whose fragment carries the real
        # file id; rebuild the canonical URL and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the same URL for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str has no .decode on Python 3 — this branch
                # would raise AttributeError there; Python-2-era idiom. Confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the open handle to the downloader to avoid a second request.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the previous try block; read the JSON body.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # NOTE(review): '%H' (24h) combined with '%p' is unusual —
                # presumably matches blip.tv stamps like '04-21-12 06:00PM'; verify.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    # Must match the UA used for the JSON request above.
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2278
2279
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Derives the flv URL from the thumbnail path in the video page, since
    the media server shares the same directory layout.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # FIX: was `self._download.report_error`, which raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail URL encodes the media directory; reuse it for the flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2328
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Resolves an episode/clip URL to an MRSS index, downloads a per-item
    configuration XML, picks a bitrate rendition, and rewrites the RTMP
    URL into a plain HTTP one.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest preference first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Maps bitrate -> container extension (used for --list-formats output).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Maps bitrate -> display resolution (used for --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report that information extraction has started."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-item configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Human-readable listing for --list-formats.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style shortcuts to the full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "download the newest episode";
            # the site will redirect us to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn which episode we landed on.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mtvnservices media URI embedded in the player markup.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # An episode is split into several acts/parts; one <item> per part.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            # e.g. 'thedailyshow' from 'thedailyshow.com'
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL into a plain HTTP one served by the CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2523
2524
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads the video description, thumbnail and player URL from the page's
    meta tags, then downloads the player's JS configuration to find the
    actual media URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report that information extraction has started."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the response, defaulting to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # ROBUSTNESS FIX: each of these matches was previously used without a
        # None check, so a page layout change crashed with AttributeError
        # instead of producing a proper error report.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.report_error(u'unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.report_error(u'unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # ROBUSTNESS FIX: guard against a short playlist (previously an
        # unhandled IndexError).
        if len(playlist) < 2:
            self._downloader.report_error(u'unable to find video URL in playlist')
            return
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2598
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML, then the Adobe HDS (f4m) manifest,
    and reconstructs a direct fragment URL from the manifest's media node.
    """

    # Marked broken upstream; kept for reference.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Info dict is filled in incrementally as the two XML documents
        # are parsed below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore parameter is required by the HDS server.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest (Adobe HDS) carries the media node we need.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the fragment URL on the manifest's host; video_id[:-2]
        # strips a 2-character suffix from the manifest id — presumably a
        # quality/part marker (NOTE(review): verify against live manifests).
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2669
2670
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is urlencoded in a flashvars-style parameter.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title is the page <title> minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the whole matched URL is kept, not just the filename group.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2728
2729
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The permalink URL is resolved through the public API into a track id,
    and the stream definitions for that id yield the mp3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        uploader = match.group(1)      # uploader name is the first path segment
        slug_title = match.group(2)    # slug of the song title
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the permalink into full track metadata via the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the stream definitions for the resolved track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2802
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # Unique name (previously u'soundcloud', clashing with SoundcloudIE), so
    # the two extractors can be told apart in log lines and extractor listings.
    IE_NAME = u'soundcloud:set'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set URL via the Soundcloud API and return one info dict per track.

        Error reporting uses report_error() (consistent with SoundcloudIE)
        instead of the deprecated trouble(u'ERROR: ...') calls used before.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the canonical set URL into a JSON description of the set.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        # API-level failures come back inside the JSON body rather than as
        # HTTP errors, so check for them explicitly.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # One streams request per track to obtain its media URL.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2883
2884
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP video URL and metadata from an InfoQ presentation page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded, URL-quoted id.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # rsplit with maxsplit=1 keeps ids that contain extra dots intact;
        # the plain split('.') used previously raised ValueError for such names.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2938
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format -> {bitrate -> [urls]} (or directly to [urls]
        when there is no bitrate level); 'best' or an unknown bitrate selects
        the highest one available.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate combinations for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud JSON API and return a single-entry info list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url; regex groups are already text,
        # so the previous .decode('utf-8') calls (which crash on Python 3,
        # where str has no decode()) are dropped
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None  # guard against an empty formats dict
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3053
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a course page, or a specific video page;
    # 'course' and 'video' named groups drive the dispatch in _real_extract.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL shape: single video, course page, or site root.

        Course and root pages collect links to their sub-pages and recurse
        through self.extract() on each, concatenating the results.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # NOTE(review): this raises while the other branches use
            # report_error-style handling elsewhere in the file.
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Title and file name come from a per-video XML descriptor.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            # Derive the extension from the URL's final suffix.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id if the page has no <h1>.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Deduplicated list of video-page links, preserving page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Recurse into each video page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Deduplicated list of course-page links from the home page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            # Recurse into each course page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3165
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video URL and metadata from an MTV video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # The scheme is optional in _VALID_URL; default to plain http.
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The regex groups are already text, so the previous
        # .decode('iso-8859-1') calls (which crash on Python 3, where str
        # has no decode()) are dropped.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML list of renditions for the video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            # 'video_format' (was 'format') avoids shadowing the builtin.
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error replaces the deprecated trouble() call used before,
            # matching the error handling elsewhere in this class.
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
3245
3246
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp + two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the seed-dependent character permutation used to decode file ids.

        A linear-congruential sequence seeded with *seed* repeatedly selects
        (and removes) characters from a fixed alphabet, producing a
        permutation of that alphabet as a list of single characters.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode *fileId* ('*'-separated indices) through the seed-derived permutation."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, decode the file id, and emit one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user-requested format onto the site's format names;
            # 'hd2' is preferred for 'best', 'mp4' serves as 'worst'.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One key per segment; each is needed to build that segment's URL.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3356
3357
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the video page and pull the media URL, title and thumbnail from it."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        def find_field(pattern, errnote):
            # Search the page; on a miss, report the error and signal failure.
            found = re.search(pattern, webpage)
            if found is None:
                self._downloader.report_error(errnote)
                return None
            return found.group(1)

        video_url = find_field(self.VIDEO_URL_RE, u'unable to extract video url')
        if video_url is None:
            return
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = find_field(self.VIDEO_TITLE_RE, u'unable to extract video title')
        if video_title is None:
            return

        video_thumbnail = find_field(self.VIDEO_THUMB_RE, u'unable to extract video thumbnail')
        if video_thumbnail is None:
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3420
3421
3422 class GooglePlusIE(InfoExtractor):
3423     """Information extractor for plus.google.com."""
3424
3425     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3426     IE_NAME = u'plus.google'
3427
    def __init__(self, downloader=None):
        # Delegate to the common InfoExtractor initialisation.
        InfoExtractor.__init__(self, downloader)
3430
    def report_extract_entry(self, url):
        """Report that the post (entry) page is being downloaded."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3434
    def report_date(self, upload_date):
        """Report the extracted upload date of the entry."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3438
    def report_uploader(self, uploader):
        """Report the extracted uploader of the entry."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3442
    def report_title(self, video_title):
        """Report the extracted title of the entry."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3446
    def report_extract_vid_page(self, video_page):
        """Report that the video page is being extracted."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3450
3451     def _real_extract(self, url):
3452         # Extract id from URL
3453         mobj = re.match(self._VALID_URL, url)
3454         if mobj is None:
3455             self._downloader.report_error(u'Invalid URL: %s' % url)
3456             return
3457
3458         post_url = mobj.group(0)
3459         video_id = mobj.group(1)
3460
3461         video_extension = 'flv'
3462
3463         # Step 1, Retrieve post webpage to extract further information
3464         self.report_extract_entry(post_url)
3465         request = compat_urllib_request.Request(post_url)
3466         try:
3467             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3468         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3469             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3470             return
3471
3472         # Extract update date
3473         upload_date = None
3474         pattern = 'title="Timestamp">(.*?)</a>'
3475         mobj = re.search(pattern, webpage)
3476         if mobj:
3477             upload_date = mobj.group(1)
3478             # Convert timestring to a format suitable for filename
3479             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3480             upload_date = upload_date.strftime('%Y%m%d')
3481         self.report_date(upload_date)
3482
3483         # Extract uploader
3484         uploader = None
3485         pattern = r'rel\="author".*?>(.*?)</a>'
3486         mobj = re.search(pattern, webpage)
3487         if mobj:
3488             uploader = mobj.group(1)
3489         self.report_uploader(uploader)
3490
3491         # Extract title
3492         # Get the first line for title
3493         video_title = u'NA'
3494         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3495         mobj = re.search(pattern, webpage)
3496         if mobj:
3497             video_title = mobj.group(1)
3498         self.report_title(video_title)
3499
3500         # Step 2, Stimulate clicking the image box to launch video
3501         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3502         mobj = re.search(pattern, webpage)
3503         if mobj is None:
3504             self._downloader.report_error(u'unable to extract video page URL')
3505
3506         video_page = mobj.group(1)
3507         request = compat_urllib_request.Request(video_page)
3508         try:
3509             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3510         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3511             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3512             return
3513         self.report_extract_vid_page(video_page)
3514
3515
3516         # Extract video links on video page
3517         """Extract video links of all sizes"""
3518         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3519         mobj = re.findall(pattern, webpage)
3520         if len(mobj) == 0:
3521             self._downloader.report_error(u'unable to extract video links')
3522
3523         # Sort in resolution
3524         links = sorted(mobj)
3525
3526         # Choose the lowest of the sort, i.e. highest resolution
3527         video_url = links[-1]
3528         # Only get the url. The resolution part in the tuple has no use anymore
3529         video_url = video_url[-1]
3530         # Treat escaped \u0026 style hex
3531         try:
3532             video_url = video_url.decode("unicode_escape")
3533         except AttributeError: # Python 3
3534             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3535
3536
3537         return [{
3538             'id':       video_id,
3539             'url':      video_url,
3540             'uploader': uploader,
3541             'upload_date':  upload_date,
3542             'title':    video_title,
3543             'ext':      video_extension,
3544         }]
3545
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The 720p MP4 URL can be built directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            """Return the unescaped first group of rexp in the page, or default."""
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: the key was 'uploader_date', which no consumer reads;
            # the documented optional field is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3581
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and convert its clips into info dicts.

        Returns (response_length, info_list); on any error the result is
        (0, []) so that callers unpacking the tuple never crash.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # Bug fix: returning None here made the caller's tuple unpacking
            # raise TypeError; return an empty page instead.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # A bare channel URL: archives must be fetched page by page.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the last page of archives.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3668
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Bug fix: without this return, m.group('url') below raised
            # AttributeError on None.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Use report_error for consistency with the rest of this IE
            # (trouble is the legacy spelling), and bail out before m.group().
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3705
class SteamIE(InfoExtractor):
    """Information extractor for Steam store videos (store.steampowered.com)."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose-mode syntax, so the match must be
        # done with re.VERBOSE explicitly.
        match = re.match(cls._VALID_URL, url, re.VERBOSE)
        return match is not None

    def _real_extract(self, url):
        """Collect every movie listed on the game's video page."""
        page_match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = page_match.group('gameID')
        video_page_url = 'http://store.steampowered.com/video/%s/' % game_id
        webpage = self._download_webpage(video_page_url, game_id)

        # Three parallel scans over the page (movie entries, display titles,
        # thumbnails) are zipped together positionally.
        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        movie_matches = re.finditer(movie_re, webpage)
        title_matches = re.finditer(title_re, webpage)
        thumb_matches = re.finditer(thumb_re, webpage)

        results = []
        for movie_m, title_m, thumb_m in zip(movie_matches, title_matches, thumb_matches):
            vid_id = movie_m.group('videoID')
            vid_url = movie_m.group('videoURL')
            if not vid_url:
                self._downloader.report_error(u'Cannot find video url for %s' % vid_id)
            results.append({
                'id': vid_id,
                'url': vid_url,
                'ext': 'flv',
                'title': unescapeHTML(title_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail'),
            })
        return results
3746
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the info dict for a recorded Ustream video."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV can be fetched straight from the CDN, addressed by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3768
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com videos."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Direct media URLs on the page point at the hw-videos CDN host.
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Derive the container format from the matched URL's suffix.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # No <title> tag found: fall back to a timestamped placeholder.
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            # Candy pages have no image_src; the real title lives in a
            # candytitles span and overrides whatever was found above.
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3824
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded in the page as a JSON assignment to
        # gon.show inside an inline script tag.
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream; the extension comes from the URL path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3859
3860
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats whose 'format' matches, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Bypass the age gate with the cookie the site sets after confirmation.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Fourth path component carries "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: previously this tested `result is None`, but `result`
            # still holds an earlier (non-None) regex match; the lookup result
            # is `format`, so a missing format silently returned [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3977
3978
3979
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The title is taken straight from the URL path, not from the page.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: the message previously said "video title", but what
            # failed here is the upload-date extraction.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4021
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = url_match.group('videoid')

        # The main page carries the title plus a link to the embed page,
        # which in turn holds the actual stream location.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        # The numeric id from the embed link supersedes the slug in the URL.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
4067
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JS assignment.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session token lets us walk the play API one track at a time.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4111
class KeekIE(InfoExtractor):
    """Information extractor for keek.com videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the stream and the thumbnail live on the CDN, addressed by id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4135
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode syntax, so match with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk or a whole playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Talk entries and their titles are scanned separately and zipped
        # together positionally.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Each talk is extracted from its own page.
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talk details blob carries the canonical id and media slug.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4208
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this fallback referenced an undefined name `ext`
            # (NameError); the local variable is `extension`.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4264
class SpiegelIE(InfoExtractor):
    """Extractor for videos on spiegel.de."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Per-video XML descriptor listing the available variants.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # Pick the last variant in the document, as the original code did.
        variant = idoc[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4297
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Consistency fix: use report_error like the other extractors in
            # this file instead of the legacy trouble() call.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: the original reported the failure but fell through to
            # m.group('title') on None, raising AttributeError. Abort instead.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional; absence is not an error.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4346
4347
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Ordered tuple of extractor classes; instantiated fresh on every call.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]