c1c206a8a1cd4780d7b72730cfbf6a0d0775c0e5
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False       # becomes True once _real_initialize() has run
    _downloader = None   # FileDownloader instance, set via set_downloader()
    _WORKING = True      # override with False in subclasses known to be broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc). Runs at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default name: the class name with the trailing 'IE' stripped.
        # Subclasses may shadow this with a plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return the response handle for url_or_request.

        note=False suppresses the progress message entirely; note=None uses
        a generic default. Raises ExtractorError on network failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return the data of the page as a string (decoded text)."""
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Accept an optionally quoted charset and stop at the end of its
        # value so trailing parameters (e.g. '; boundary=...') are not
        # swallowed into the encoding name.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset="?([^"\s;]+)"?', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server advertised a charset Python does not know about;
            # fall back to UTF-8 rather than crashing.
            return webpage_bytes.decode('utf-8', 'replace')
138
139
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything unlisted is assumed to be flv
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display dimensions (as written upstream: height x width)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; defer to the playlist IE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to list the available subtitle tracks."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name.

        On failure returns an (error_message, None) tuple instead; callers
        distinguish the two cases with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Preference: explicit --sub-lang, then English, then whatever exists
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error_message, sub_lang, sub) tuples, one per
        available language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print itag, extension and dimensions for each format code."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form tokens required by the Google login form
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        # POST data must be bytes under Python 3 (same as the login POST above)
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video id contained in url, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Group 1 is the (optional) scheme/host/path prefix; group 2 is the id
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Return a list of info dictionaries, one per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The URL appears JS-escaped in the page; strip the backslashes
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values, stopping at the first
        # response that carries a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # strptime raises ValueError for a non-matching format;
                    # just try the next expression
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = {}
            for ud in url_data:
                # Not every entry carries a separate 'sig' parameter; when it
                # is absent, use the URL as-is instead of raising KeyError
                stream_url = ud['url'][0]
                if 'sig' in ud:
                    stream_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = stream_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
637
638
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Passes the site's family-filter/age gate during initialization, then
    extracts the direct media URL, title and uploader from a watch page.
    Video ids of the form ``yt-<id>`` are delegated back to the downloader
    as YouTube URLs.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the age-confirmation form."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        # POST data must be bytes on Python 3; .encode('ascii') is a no-op
        # on Python 2 byte strings.
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode to text right away so the str regexes below also work
            # on Python 3, where urlopen().read() returns bytes.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob embedded in the page
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # JSON-escaped slashes in the URL
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # Match groups are already text; calling .decode('utf-8') on them
        # would crash on Python 3, where str has no .decode().
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
764
765
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Pick the best available quality from the page's flashvars blob."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The URL slug may carry a readable title and a query string;
        # keep only the bare id in front of them.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; stop at the first one present.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # Undo percent-encoding and JSON-escaped slashes.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        # Prefer the regular owner span; fall back to the official-user markup.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is not None:
            video_uploader = mobj.group(1)
        else:
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is not None:
                video_uploader = mobj_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; repack as YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
853
854
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Extracts the FLV media URL plus title and uploader from the page's
    ``<link rel="video_src">`` and ``<title>`` tags.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode to text right away so the str regexes below also work
            # on Python 3, where urlopen().read() returns bytes.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # Match groups are already text; calling .decode('utf-8') on them
        # would crash on Python 3, where str has no .decode().
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
918
919
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-'/watch/' URLs are first rewritten to the canonical English
    /watch/ form (one recursion), then the media URL is resolved through
    the cosmos.bcst.yahoo.com playlist service.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode to text so the str regexes below also work on
                # Python 3, where urlopen().read() returns bytes.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # group(1) only captures the 'people'/'profile' path component;
        # the uploader name is the anchor text in group(2).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed by the playlist service)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        # unquote() already returns text here; no .decode() (str has no
        # .decode() on Python 3).
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1061
1062
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the page's embedded config JSON for metadata, then builds a
    play_redirect URL from the request signature/timestamp and the best
    available codec/quality combination.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize protocol-less and direct-link URLs to a canonical page URL
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON.  Only catch what the parsing can actually
        # raise - IndexError when the markers are missing, ValueError from
        # json.loads - instead of a bare "except:", which would also swallow
        # KeyboardInterrupt and SystemExit.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first codec found in the best quality bucket
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1181
1182
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Resolves a videos.arte.tv page through a chain of intermediate
    documents (see grep_webpage); live-stream URLs (matching _LIVE_URL)
    take a separate, currently incomplete, path.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the raw body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urlopen raises ValueError for malformed URL strings
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, match regex against it and return a dict built from
        matchTuples, each (group index, key, error message).  Returns None
        (after reporting) when the regex or any group fails to match.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None and re.search
        # below raises TypeError instead of reporting cleanly - confirm.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): uses the deprecated trouble() here but
                # report_error() elsewhere - consider unifying.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS to an rtmp URL.

        NOTE(review): video_url is computed but never returned, so callers
        (see _real_extract) get None - this path looks unfinished.
        """
        # Language code is a path component, e.g. .../fr/...
        video_lang = url.split('/')[-4]
        # Step 1: locate the videothek JS file referenced by the page
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Step 2: pull the stream path, SWF player and rtmp base from the JS
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page through videoref XML to an info dict."""
        video_lang = url.split('/')[-3]
        # Step 1: the flash movie param points at a videoref XML URL
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the language-specific <video> ref from the XML
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the final XML carries id, title, date and the HD URL
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url)
                ,
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode('utf-8') fails on Python 3 if the title
            # is already str - confirm which type grep_webpage yields here.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or arte+7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns nothing, so this
            # branch produces no result (see extractLiveStream).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1317
1318
1319 class GenericIE(InfoExtractor):
1320     """Generic last-resort information extractor."""
1321
1322     _VALID_URL = r'.*'
1323     IE_NAME = u'generic'
1324
1325     def __init__(self, downloader=None):
1326         InfoExtractor.__init__(self, downloader)
1327
1328     def report_download_webpage(self, video_id):
1329         """Report webpage download."""
1330         if not self._downloader.params.get('test', False):
1331             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1332         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1333
1334     def report_extraction(self, video_id):
1335         """Report information extraction."""
1336         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1337
1338     def report_following_redirect(self, new_url):
1339         """Report information extraction."""
1340         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1341
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request (falling back to GET on 405) and, when the
        final URL differs from *url*, hands the new URL back to the
        downloader and returns True; returns False if no redirect happened.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force the HEAD verb so we never download the response body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unescaped spaces in Location headers
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    # NOTE(review): get_origin_req_host() was removed in
                    # newer Python 3 releases - confirm the compat shim.
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Plain Request => GET retry of the same URL
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # A bare OpenerDirector is used (instead of build_opener) so that
        # only the handlers listed here participate.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: let the normal extraction continue
        if url == new_url:
            return False

        # Restart the whole extraction chain on the redirect target
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1396
1397     def _real_extract(self, url):
1398         if self._test_redirect(url): return
1399
1400         video_id = url.split('/')[-1]
1401         try:
1402             webpage = self._download_webpage(url, video_id)
1403         except ValueError as err:
1404             # since this is the last-resort InfoExtractor, if
1405             # this error is thrown, it'll be thrown here
1406             self._downloader.report_error(u'Invalid URL: %s' % url)
1407             return
1408
1409         self.report_extraction(video_id)
1410         # Start with something easy: JW Player in SWFObject
1411         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1412         if mobj is None:
1413             # Broaden the search a little bit
1414             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1415         if mobj is None:
1416             # Broaden the search a little bit: JWPlayer JS loader
1417             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1418         if mobj is None:
1419             self._downloader.report_error(u'Invalid URL: %s' % url)
1420             return
1421
1422         # It's possible that one of the regexes
1423         # matched, but returned an empty group:
1424         if mobj.group(1) is None:
1425             self._downloader.report_error(u'Invalid URL: %s' % url)
1426             return
1427
1428         video_url = compat_urllib_parse.unquote(mobj.group(1))
1429         video_id = os.path.basename(video_url)
1430
1431         # here's a fun little line of code for you:
1432         video_extension = os.path.splitext(video_id)[1][1:]
1433         video_id = os.path.splitext(video_id)[0]
1434
1435         # it's tempting to parse this further, but you would
1436         # have to take into account all the variations like
1437         #   Video Title - Site Name
1438         #   Site Name | Video Title
1439         #   Video Title - Tagline | Site Name
1440         # and so on and so forth; it's just not practical
1441         mobj = re.search(r'<title>(.*)</title>', webpage)
1442         if mobj is None:
1443             self._downloader.report_error(u'unable to extract title')
1444             return
1445         video_title = mobj.group(1)
1446
1447         # video uploader is domain name
1448         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1449         if mobj is None:
1450             self._downloader.report_error(u'unable to extract title')
1451             return
1452         video_uploader = mobj.group(1)
1453
1454         return [{
1455             'id':       video_id,
1456             'url':      video_url,
1457             'uploader': video_uploader,
1458             'upload_date':  None,
1459             'title':    video_title,
1460             'ext':      video_extension,
1461         }]
1462
1463
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (``ytsearchN:term``)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix (empty, 'all', or a count) and dispatch."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    # Clamp to the API limit rather than failing outright.
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query.

        Pages through the GData API (50 results per page) until *n* results
        have been collected or the API reports no more items, then enqueues
        each video for download.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                # Consistency fix: every other error path in this file uses
                # report_error; trouble() is its deprecated predecessor.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer total results than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1542
1543
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Validate a gvsearch query and dispatch the requested downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                count = self._max_google_results
            self._download_n_results(query, count)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Download at most *n* search results for *query*."""
        collected = []
        page_index = 0

        while True:
            self.report_download_page(query, page_index)
            search_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), page_index * 10)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(search_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Pull video ids off this results page, skipping duplicates
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in collected:
                    continue
                collected.append(candidate)
                if len(collected) == n:
                    # Requested number reached: enqueue everything and stop
                    for video_id in collected:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further pages: enqueue what was found
                for video_id in collected:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                return

            page_index = page_index + 1
1624
1625
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Validate a yvsearch query and dispatch the requested downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
                count = self._max_yahoo_results
            self._download_n_results(query, count)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Download at most *n* search results for *query*."""
        found = []
        seen = set()
        page_no = 1

        while True:
            self.report_download_page(query, page_no)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), page_no)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen video ids from this page
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in seen:
                    continue
                found.append(candidate)
                seen.add(candidate)
                if len(found) == n:
                    # Requested number reached: enqueue everything and stop
                    for video_id in found:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further pages: enqueue what was found
                for video_id in found:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                return

            page_no = page_no + 1
1710
1711
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the GData playlist feed and enqueue every video."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # The playlist id sits in one of two alternative capture groups
        playlist_id = mobj.group(1) or mobj.group(2)
        entries = []
        page_num = 1

        while True:
            self.report_download_page(playlist_id, page_num)

            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response or 'entry' not in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so the final list can be ordered
            entries.extend((entry['yt$position']['$t'], entry['content']['src'])
                           for entry in response['feed']['entry']
                           if 'content' in entry)

            # A short page means the feed is exhausted
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [pair[1] for pair in sorted(entries)]
        total = len(videos)

        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        videos = videos[start:] if end == -1 else videos[start:end]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video_url in videos:
            self._downloader.download([video_url])
        return
1802
1803
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk a channel's video listing pages and enqueue every video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        page_no = 1

        while True:
            self.report_download_page(channel_id, page_no)
            page_url = self._TEMPLATE_URL % (channel_id, page_no)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(page_url)).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect video ids, de-duplicated within this page
            fresh = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in fresh:
                    fresh.append(candidate)
            video_ids.extend(fresh)

            # Stop once the "Next »" marker disappears
            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_no = page_no + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1854
1855
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch a user's uploads via the GData API and enqueue each video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request consecutive windows until one comes back short.
        video_ids = []
        page_no = 0

        while True:
            start_index = page_no * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(gdata_url)).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Ids found on this page, de-duplicated within the page only
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page shorter than the page size must be the last one,
            # so no further query is needed.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_no += 1

        all_ids_count = len(video_ids)
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[first:] if last == -1 else video_ids[first:last]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1937
1938
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve a blip.tv user page to its numeric id, then page through
        the Ajax episode list and enqueue every video found."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Robustness fix: previously a page without a data-users-id attribute
        # made mobj.group(1) raise an uncaught AttributeError.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract blip.tv user id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Consistency fix: use compat_str like every other error path
                # in this file (was plain str).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2028
2029
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles.com page into a direct file URL.

        Simulates pressing the 'Free download' button, scrapes the real
        download URL and the file title, and returns a single-entry info
        list (or None after reporting an error).
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Fix: regex pattern must be a raw string - '\s' in a plain
                # literal is an invalid escape sequence (DeprecationWarning
                # on modern Python).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2088
2089
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Credentials come either from --username/--password or, with
        --netrc, from the 'facebook' machine entry in ~/.netrc.  Logging
        in is best-effort: failures only emit warnings and extraction
        proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response so the regex below matches text on
            # Python 3, where read() returns bytes.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login <form> in the response means we are still logged out.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title and metadata from a Facebook page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two literal script snippets.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, falling back to SD.  Use .get so a missing
        # key reports 'Cannot find video URL' instead of raising KeyError.
        video_url = params.get('hd_src')
        if not video_url:
            video_url = params.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2185
2186
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Matches any path on (sub)domains of blip.tv; group(1) is the path.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract a blip.tv video via its JSON metadata (skin=json)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page; the real file id is inside
        # the URL fragment of the redirect target.  Re-run extraction on the
        # canonical http://blip.tv/a/a-<id> form.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask blip.tv for JSON metadata by appending the skin=json query.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The server varies its response by user agent; mimic iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode('UTF-8') assumes a bytes value, but
                # os.path.splitext returns str on Python 3 — verify.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand over the already-open handle so the file is not
                    # requested a second time by the downloader.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Responses are either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert e.g. '12-31-12 11:05PM' into the YYYYMMDD form.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2287
2288
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page.

        Returns a one-element list of info dicts, or None after reporting
        an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was self._download.report_error, which raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the movie base URL; the video itself
        # is served from <base>/<id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2337
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates; the last entry is used as the default selection.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (all mp4).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate, shown by --list-formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): overridden presumably because _VALID_URL requires
        # re.VERBOSE — confirm against the base class implementation.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's RSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print each available bitrate with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract every part of an episode (or a single clip).

        Returns a list of info dicts, one per <item> in the episode's
        RSS index, or None after reporting an error.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcut forms ':tds' / ':colbert' mean "the newest full episode".
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode means "newest"; the site redirects the
            # bare full-episodes URL to the concrete episode page.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn the concrete episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id (data-mgid) without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The RSS index lists every part (item) of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like ...:<show>.com:...:<shortMediaId>
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs for every rendition.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL into the equivalent progressive
            # HTTP download location.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2532
2533
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video via the page's og: meta tags and player config.

        Robustness fix: every page-scrape regex is now checked before
        .group() is called, so a page-layout change reports an error
        instead of crashing with AttributeError on None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset from the Content-Type header, default utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.report_error(u'unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.report_error(u'unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The config URL rides along as a query parameter of the player URL.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2607
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): _WORKING = False presumably marks this extractor as
    # disabled in the framework — confirm handling in the base class.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video info via the moogaloop XML and the f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: metadata XML with title, description, thumbnail
        # and the manifest URL.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Second request: Adobe HDS (f4m) manifest describing the media.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Compose the fragment URL on the same host as the manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2678
2679
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the watch page for the FLV URL, title and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The direct FLV URL is percent-encoded in the page source
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The <title> element carries the clip name before the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the entire matched image URL is used
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2737
2738
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    The track is first resolved through the public resolve.json API,
    then its stream definitions are fetched and the URL of the
    128 kbit/s MP3 stream is returned.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track URL and return its MP3 stream as an info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Uploader name and song slug both come straight from the URL
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        full_title = '%s/%s' % (uploader, slug_title)
        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            info_bytes = compat_urllib_request.urlopen(compat_urllib_request.Request(resolv_url)).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_bytes.decode('utf-8'))
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            stream_bytes = compat_urllib_request.urlopen(compat_urllib_request.Request(streams_url)).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_bytes.decode('utf-8'))

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2811
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    Resolves the set page through SoundCloud's public API
    (resolve.json) and then fetches a stream URL for every track in
    the set.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the API id resolution has started."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that stream retrieval has started."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error (not the deprecated trouble()) for consistency
            # with the other extractors in this file.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The uploader name and the slug of the set title are both in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the canonical set page to its API representation.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports problems (e.g. a private set) as an error list.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Fetch the stream definitions for each track and pick the 128k MP3.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                # NOTE(review): created_at is the raw API timestamp, not the
                # YYYYMMDD form documented for upload_date — TODO normalize.
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2892
2893
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The real video id is base64-encoded in the page source; the final
    URL is an rtmpe stream built from that id.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report that information extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref holds the base64-encoded stream id.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the last path component; split on the
        # LAST dot so a filename containing dots does not raise ValueError.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2947
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Return the candidate URL list for fmt from the 'audio_formats' JSON.

        If the format has per-bitrate sub-entries the requested (or highest
        available) bitrate is selected; otherwise the format maps directly
        to a URL list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Return the first URL in url_list that responds, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead link; try the next one

        return None

    def _print_formats(self, formats):
        """List the available format/bitrate combinations on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url; the match groups are already
        # text strings, so no .decode() (str.decode crashes on Python 3).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (the HTTP response is bytes; decode before parsing)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try every advertised format until one of its URLs responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3062
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page, a course page
    (expanded into its video pages), and the root page (expanded into
    all course pages).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            # ElementTree accepts the raw bytes directly.
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Recurse into every referenced video page.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Decode before regex matching: a str pattern against the raw
                # bytes from urlopen() raises TypeError on Python 3.
                # (Assumes the page is UTF-8 — TODO confirm.)
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            # Recurse into every referenced course page.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3174
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Scrapes song/performer metadata from the page, then queries the
    mediaGen endpoint for the rendition list and picks the last (highest
    quality) one.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns a text string, so the match groups are
        # already str — no .decode() (str.decode crashes on Python 3).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # Ask the mediaGen endpoint for the rendition list of this video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error (not the deprecated trouble()) for consistency.
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3254
3255
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    The playlist API returns an obfuscated file id plus a numeric seed;
    the id is decoded with a seed-driven character permutation and then
    expanded into one download URL per video segment.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a session id: millisecond timestamp plus two random numbers."""
        return "%d%d%d" % (int(time.time() * 1000),
                           random.randint(1000, 1998),
                           random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Derive the character-permutation table from the numeric seed."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        while pool:
            # Linear-congruential step drives the pseudo-random pick.
            seed = (seed * 211 + 30031) % 65536
            pick = int(math.floor(seed / 65536 * len(pool)))
            shuffled.append(pool.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated id via the permutation table."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(tok)] for tok in fileId.split('*') if tok)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            raw_json = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(raw_json.decode('utf-8'))
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # Map the requested format onto what the playlist offers.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the decoded file id encode the segment number,
        # so they are replaced per segment below.
        files_info = []
        for index, key in enumerate(keys):
            seg_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, seg_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3365
3366
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com.

    All fields (flv URL, title, thumbnail) are scraped directly from
    the video page source.
    """

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report that the webpage download has started."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that information extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page body.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        m = re.search(self.VIDEO_URL_RE, webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        m = re.search(self.VIDEO_TITLE_RE, webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        m = re.search(self.VIDEO_THUMB_RE, webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3429
3430
3431 class GooglePlusIE(InfoExtractor):
3432     """Information extractor for plus.google.com."""
3433
3434     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3435     IE_NAME = u'plus.google'
3436
    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading of the post entry page."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the upload date extracted from the entry."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader extracted from the entry."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title extracted from the entry."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report extraction of the linked video page."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3459
3460     def _real_extract(self, url):
3461         # Extract id from URL
3462         mobj = re.match(self._VALID_URL, url)
3463         if mobj is None:
3464             self._downloader.report_error(u'Invalid URL: %s' % url)
3465             return
3466
3467         post_url = mobj.group(0)
3468         video_id = mobj.group(1)
3469
3470         video_extension = 'flv'
3471
3472         # Step 1, Retrieve post webpage to extract further information
3473         self.report_extract_entry(post_url)
3474         request = compat_urllib_request.Request(post_url)
3475         try:
3476             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3477         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3478             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3479             return
3480
3481         # Extract update date
3482         upload_date = None
3483         pattern = 'title="Timestamp">(.*?)</a>'
3484         mobj = re.search(pattern, webpage)
3485         if mobj:
3486             upload_date = mobj.group(1)
3487             # Convert timestring to a format suitable for filename
3488             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3489             upload_date = upload_date.strftime('%Y%m%d')
3490         self.report_date(upload_date)
3491
3492         # Extract uploader
3493         uploader = None
3494         pattern = r'rel\="author".*?>(.*?)</a>'
3495         mobj = re.search(pattern, webpage)
3496         if mobj:
3497             uploader = mobj.group(1)
3498         self.report_uploader(uploader)
3499
3500         # Extract title
3501         # Get the first line for title
3502         video_title = u'NA'
3503         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3504         mobj = re.search(pattern, webpage)
3505         if mobj:
3506             video_title = mobj.group(1)
3507         self.report_title(video_title)
3508
3509         # Step 2, Stimulate clicking the image box to launch video
3510         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3511         mobj = re.search(pattern, webpage)
3512         if mobj is None:
3513             self._downloader.report_error(u'unable to extract video page URL')
3514
3515         video_page = mobj.group(1)
3516         request = compat_urllib_request.Request(video_page)
3517         try:
3518             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3519         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3520             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3521             return
3522         self.report_extract_vid_page(video_page)
3523
3524
3525         # Extract video links on video page
3526         """Extract video links of all sizes"""
3527         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3528         mobj = re.findall(pattern, webpage)
3529         if len(mobj) == 0:
3530             self._downloader.report_error(u'unable to extract video links')
3531
3532         # Sort in resolution
3533         links = sorted(mobj)
3534
3535         # Choose the lowest of the sort, i.e. highest resolution
3536         video_url = links[-1]
3537         # Only get the url. The resolution part in the tuple has no use anymore
3538         video_url = video_url[-1]
3539         # Treat escaped \u0026 style hex
3540         try:
3541             video_url = video_url.decode("unicode_escape")
3542         except AttributeError: # Python 3
3543             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3544
3545
3546         return [{
3547             'id':       video_id,
3548             'url':      video_url,
3549             'uploader': uploader,
3550             'upload_date':  upload_date,
3551             'title':    video_title,
3552             'ext':      video_extension,
3553         }]
3554
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The media URL can be derived directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            """Return the first (HTML-unescaped) group of rexp in the page, or default."""
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date' was a typo; the recognized optional field
            # (see the field list at the top of this file) is 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3590
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page of clips.

        Returns a (total item count, list of info dicts) tuple, or None
        after reporting the error when the download or the API response
        fails.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API returns an error object instead of a list
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with 'YYYY-MM-DD'; keep it as YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paginated
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page = self._parse_page(page_url)
            if page is None:
                # _parse_page already reported the error; unpacking None
                # would raise TypeError, so stop with what we have so far.
                break
            page_count, page_info = page
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3677
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Bail out: m.group() below would raise AttributeError otherwise
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # report_error (not the legacy trouble) for consistency with
            # the rest of the file, and stop before m.group() crashes
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3714
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains (?x)-style comments, so match verbosely
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators advance in lockstep; each triple describes one movie
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                # Skip this entry rather than emit an info dict with an empty URL
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return videos
3755
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The media file lives on the CDN at a path derived from the id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3777
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and WSHH candy) videos."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Direct media URLs are embedded in the page source
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # report_error instead of the legacy trouble call; report_error
            # supplies the 'ERROR: ' prefix itself
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        # Page <title>, falling back to a timestamped placeholder
        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fixed typo: 'World Start Hip Hop' -> 'World Star Hip Hop'
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3833
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as JSON inside an inline script
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream explicitly
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3868
3869
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 is e.g. '480p_370k_8004515': resolution, bitrate, id
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: previously tested the stale regex object 'result', so a
            # missing format silently fell through and returned [None]
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3986
3987
3988
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is taken straight from the URL path
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed message: this branch concerns the upload date, not the title
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4030
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the main video page
        webpage = self._download_webpage(url, video_id)

        # The page <title> doubles as the video title
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Locate the embed page, which holds the actual media URL
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via addVariable("file", ...)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4076
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment in the page script
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id is required by the play API
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        # The API serves one track at a time; follow it until the last track
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
        return tracks
4120
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail live on the CDN at id-derived paths
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_match.group('uploader')),
        }]
4144
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains (?x)-style comments, so match verbosely
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # Single talk: one info dict, wrapped in a list
            return [self._talk_info(url)]
        else :
            # Playlist: extract every talk it contains
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): [.\s] matches only a literal dot or whitespace;
        # presumably [\s\S] ("any character") was intended — confirm against
        # live playlist markup before changing.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # The two finditer streams advance in lockstep: the n-th id match is
        # paired with the n-th title/link match
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline JS object; pull id and mediaSlug out of it
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # The downloadable mp4 URL is derived from the media slug
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4217
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, using the site's XML metadata API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fixed NameError: the fallback referenced an undefined name 'ext';
            # the file-extension variable in this scope is 'extension'
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4273
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de video pages."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract the video info for a spiegel.de URL.

        Raises ExtractorError when the page title cannot be located.
        Returns a one-element list with the info dict.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Stream details live in a separate per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Only the document's last child is consulted — presumably the
        # preferred/best variant; verify against the XML layout.
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4306
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract video info for a liveleak.com URL.

        Raises ExtractorError when the page title cannot be found.
        Returns a one-element list with the info dict, or None after
        reporting an error for an invalid URL or missing video URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUGFIX: the original only reported the problem and fell
            # through to m.group('title') on a None match, crashing with
            # AttributeError; raise instead (matching SpiegelIE's style).
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4355
class ARDIE(InfoExtractor):
    """Extractor for ARD Mediathek / Das Erste Mediathek pages."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Extract video info for an ARD Mediathek URL.

        Returns a one-element list with the info dict, or None after
        reporting an error when no streams are present (age-restricted).
        """
        # Prefer the numeric documentId query parameter as the id;
        # otherwise take the last path component of the URL.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [match.groupdict()
                   for match in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # Keep only the default media type (0) and pick the highest quality.
        candidates = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(candidates, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # RTMP needs both the server URL and the play path.
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            # Plain HTTP download of an mp4 file.
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4395
4396
def gen_extractors():
    """Instantiate and return every supported extractor.

    The list order is significant: the first extractor that matches a URL
    is the one that handles it, so the catch-all GenericIE stays last.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]