Add the to_screen method to InfoExtractor (related to #608)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
from __future__ import absolute_import

import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor receives a URL and produces the metadata of
    the video (or videos) behind it: the real media URL, the title, the
    uploader and so on.  The result is a list of dictionaries which is
    handed to the FileDownloader, which in turn acts on it (typically by
    downloading the media to the file system).

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All of these values should be Unicode strings.

    Concrete subclasses redefine _real_initialize() and _real_extract()
    and define a _VALID_URL regexp; they should normally also be added to
    the list of extractors.  _real_extract() must return a *list* of
    dictionaries shaped as described above.  Broken extractors should set
    _WORKING to False so users are warned and the tests skip them.
    """

    _ready = False          # becomes True once _real_initialize() has run
    _downloader = None      # FileDownloader instance, set via set_downloader()
    _WORKING = True         # set to False in broken subclasses

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc); runs at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by dropping the trailing "IE"
        # (subclasses may override this with a plain class attribute).
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        A progress line is printed unless note is False; note=None selects
        a default message.  Network failures are re-raised as
        ExtractorError, carrying the original traceback.
        """
        message = u'Downloading video webpage' if note is None else note
        if message is not False:
            self.to_screen(u'%s: %s' % (video_id, message))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            reason = u'Unable to download webpage' if errnote is None else errnote
            raise ExtractorError(u'%s: %s' % (reason, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download url_or_request and return the page body as a string."""
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset declared in the Content-Type header, if any.
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
                                 handle.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw_page = handle.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps the dump a single safe-to-print blob.
            self._downloader.to_screen(base64.b64encode(raw_page).decode('ascii'))
        return raw_page.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    # Helpers implementing #608: they tag the result dict with the
    # correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
173
174
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: matches full watch/embed URLs as well as a naked
    # 11-character video ID.  Group 2 is the video ID (see _extract_id).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Endpoints used by _real_initialize (language, login, age gate).
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # itags, listed in order of quality (best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; itags missing here default to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> frame size as 'heightxwidth'; '???' when unknown
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'  # overrides the base-class IE_NAME property
234
235     @classmethod
236     def suitable(cls, url):
237         """Receives a URL and returns True if suitable for this IE."""
238         if YoutubePlaylistIE.suitable(url): return False
239         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
240
    # Each of the following helpers prints a one-line status message via
    # InfoExtractor.to_screen(), prefixed with '[youtube]'.

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available video subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for a language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
285
    def _get_available_subtitles(self, video_id):
        """Fetch the list of subtitle tracks for video_id.

        Returns a dict mapping language code -> track name on success, or
        a tuple (error_message, None) on failure; callers distinguish the
        two cases with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Each track in the XML listing looks like name="..." ... lang_code="..."
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list
298
    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id.

        NOTE(review): when _get_available_subtitles() fails it returns an
        error *tuple*, which report_video_subtitles_available() then treats
        as a dict (it calls .keys()) -- confirm the error path is handled
        by callers before relying on this.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)
302
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Download one subtitle track from the timedtext API.

        Return tuple:
        (error_message, sub_lang, sub)
        where error_message is None on success, and sub_lang/sub are None
        on failure.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            # An empty body means the track exists but has no content.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
323
324     def _extract_subtitle(self, video_id):
325         """
326         Return a list with a tuple:
327         [(error_message, sub_lang, sub)]
328         """
329         sub_lang_list = self._get_available_subtitles(video_id)
330         sub_format = self._downloader.params.get('subtitlesformat')
331         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
332             return [(sub_lang_list[0], None, None)]
333         if self._downloader.params.get('subtitleslang', False):
334             sub_lang = self._downloader.params.get('subtitleslang')
335         elif 'en' in sub_lang_list:
336             sub_lang = 'en'
337         else:
338             sub_lang = list(sub_lang_list.keys())[0]
339         if not sub_lang in sub_lang_list:
340             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
341
342         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
343         return [subtitle]
344
345     def _extract_all_subtitles(self, video_id):
346         sub_lang_list = self._get_available_subtitles(video_id)
347         sub_format = self._downloader.params.get('subtitlesformat')
348         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
349             return [(sub_lang_list[0], None, None)]
350         subtitles = []
351         for sub_lang in sub_lang_list:
352             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
353             subtitles.append(subtitle)
354         return subtitles
355
356     def _print_formats(self, formats):
357         print('Available formats:')
358         for x in formats:
359             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
360
    def _real_initialize(self):
        """Set the interface language and, if credentials are available,
        log in and confirm age.

        Credentials come from the 'username'/'password' options or, with
        'usenetrc', from the ~/.netrc entry for machine 'youtube'.  All
        failures are reported as warnings (errors for the age check) and
        abort initialization early.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX and dsh form tokens out of the login page;
        # both stay None (and are submitted as such) when not found.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return
464
465     def _extract_id(self, url):
466         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
467         if mobj is None:
468             self._downloader.report_error(u'invalid URL: %s' % url)
469             return
470         video_id = mobj.group(2)
471         return video_id
472
    def _real_extract(self, url):
        """Extract metadata and download URLs for a single YouTube video.

        Returns a list with one info dict per selected format (see the
        InfoExtractor docstring for the dict layout), or None after
        reporting an error.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL (the URL appears JS-escaped in
        # the page, so the backslash escapes are stripped afterwards)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the 'el' variants in turn until one of them
        # yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # bare except: strptime raises ValueError on a mismatch;
                    # just try the next expected date format
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        # NOTE(review): video_token is never used below -- confirm it can
        # be dropped or is needed by a caller
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): entries are only filtered for 'itag' and 'url',
            # so ud['sig'] below can raise KeyError if a stream has no
            # signature field -- confirm against current get_video_info data
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected format
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
672
673
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so that
        age-restricted videos become visible for the session."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video information from a metacafe watch URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Videos with a yt- prefixed id are hosted on YouTube; delegate.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information.
        # NOTE: _download_webpage returns an already-decoded unicode string,
        # so nothing extracted from it needs further decoding.
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended to the media URL) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars block, which carries the media data
            # as a query string with a JSON-ish 'mediaData' entry.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # BUGFIX: the webpage is unicode, so the previous .decode('utf-8')
        # calls on these values raised UnicodeDecodeError for non-ASCII
        # titles/uploaders (py2) or AttributeError (py3); they are dropped.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
792
793
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the video information for a Dailymotion URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the URL path component up to the first '_' or '?'
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter switched off
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Probe the qualities from best to worst and keep the first available
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: try the regular owner span, then the official-user markup
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = official_match.group(1)

        # Upload date is DD-MM-YYYY on the page; normalize to YYYYMMDD
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
881
882
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the video information for a Photobucket URL."""
        # The video id is the flv name in the ?current= query parameter
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Fetch the raw page for further parsing
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The media URL sits in the video_src link tag
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # One <title> pattern yields both the title and the uploader
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
946
947
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video information; non-/watch/ URLs are first rewritten
        to a canonical /watch/ URL and this method recurses once on it."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the rewritten /watch/ URL
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group(1) is the (people|profile) alternation; the uploader
        # name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1089
1090
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video information for a Vimeo URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize: force https and turn player/direct links into canonical URLs
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # NOTE(review): the string-splitting is fragile and depends on the
        # exact ' = {config:' / ',assets:' markers in the page source.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # BUGFIX: was a bare 'except:', which also swallowed
            # KeyboardInterrupt/SystemExit.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (ISO date in a meta tag, normalized to YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket, preferring hd > sd > other
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1209
1210
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its content, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, match regex against it and map groups into a dict.

        matchTuples is a list of (group_index, key, error_message) triples;
        a group that did not participate in the match aborts with its message.
        Returns the dict, or None on failure.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # trouble() is the legacy error channel; the messages passed
                # in already carry their own u'ERROR: ' prefix.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the videothek JS to the rtmp live-stream parameters."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Regex fragments are raw strings so the \. and \' escapes reach the
        # regex engine verbatim (the patterns themselves are unchanged).
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or stored, so
        # live streams currently extract nothing — looks like an unfinished
        # feature; confirm before wiring it up.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page through its videoref XML chain to an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between the live-stream and the arte+7 extraction paths."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams yield no result (see extractLiveStream above)
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1345
1346
1347 class GenericIE(InfoExtractor):
1348     """Generic last-resort information extractor."""
1349
1350     _VALID_URL = r'.*'
1351     IE_NAME = u'generic'
1352
1353     def __init__(self, downloader=None):
1354         InfoExtractor.__init__(self, downloader)
1355
1356     def report_download_webpage(self, video_id):
1357         """Report webpage download."""
1358         if not self._downloader.params.get('test', False):
1359             self._downloader.report_warning(u'Falling back on generic information extractor.')
1360         self.to_screen(u'%s: Downloading webpage' % video_id)
1361
1362     def report_extraction(self, video_id):
1363         """Report information extraction."""
1364         self.to_screen(u'%s: Extracting information' % video_id)
1365
1366     def report_following_redirect(self, new_url):
1367         """Report information extraction."""
1368         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1369
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so no response body is downloaded.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unescaped spaces in Location headers
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request carries no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other status stops redirect processing with an HTTPError
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE(review): handler order appears deliberate — the 405 fallback is
        # registered before the redirect handler; confirm before reordering.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # geturl() returns the final URL; if it equals the input there was no redirect
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1423
1424     def _real_extract(self, url):
1425         new_url = self._test_redirect(url)
1426         if new_url: return [self.url_result(new_url)]
1427
1428         video_id = url.split('/')[-1]
1429         try:
1430             webpage = self._download_webpage(url, video_id)
1431         except ValueError as err:
1432             # since this is the last-resort InfoExtractor, if
1433             # this error is thrown, it'll be thrown here
1434             self._downloader.report_error(u'Invalid URL: %s' % url)
1435             return
1436
1437         self.report_extraction(video_id)
1438         # Start with something easy: JW Player in SWFObject
1439         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1440         if mobj is None:
1441             # Broaden the search a little bit
1442             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1443         if mobj is None:
1444             # Broaden the search a little bit: JWPlayer JS loader
1445             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1446         if mobj is None:
1447             self._downloader.report_error(u'Invalid URL: %s' % url)
1448             return
1449
1450         # It's possible that one of the regexes
1451         # matched, but returned an empty group:
1452         if mobj.group(1) is None:
1453             self._downloader.report_error(u'Invalid URL: %s' % url)
1454             return
1455
1456         video_url = compat_urllib_parse.unquote(mobj.group(1))
1457         video_id = os.path.basename(video_url)
1458
1459         # here's a fun little line of code for you:
1460         video_extension = os.path.splitext(video_id)[1][1:]
1461         video_id = os.path.splitext(video_id)[0]
1462
1463         # it's tempting to parse this further, but you would
1464         # have to take into account all the variations like
1465         #   Video Title - Site Name
1466         #   Site Name | Video Title
1467         #   Video Title - Tagline | Site Name
1468         # and so on and so forth; it's just not practical
1469         mobj = re.search(r'<title>(.*)</title>', webpage)
1470         if mobj is None:
1471             self._downloader.report_error(u'unable to extract title')
1472             return
1473         video_title = mobj.group(1)
1474
1475         # video uploader is domain name
1476         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1477         if mobj is None:
1478             self._downloader.report_error(u'unable to extract title')
1479             return
1480         video_uploader = mobj.group(1)
1481
1482         return [{
1483             'id':       video_id,
1484             'url':      video_url,
1485             'uploader': video_uploader,
1486             'upload_date':  None,
1487             'title':    video_title,
1488             'ext':      video_extension,
1489         }]
1490
1491
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearchN:query)."""

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        # Consistency: route through to_screen like the other extractors
        # instead of hard-coding a '[youtube]' tag.
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and return the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # Bug fix: the results were computed but never returned here,
            # so 'ytsearchall:' silently downloaded nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                # report_error replaces the deprecated trouble() used before.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer total items than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1565
1566
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and queue the requested downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the literal 'gvsearch'
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen video identifiers from this page.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                docid = match.group(1)
                if docid in video_ids:
                    continue
                video_ids.append(docid)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._enqueue_all(video_ids)
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._enqueue_all(video_ids)
                return

            pagenum = pagenum + 1

    def _enqueue_all(self, video_ids):
        """Hand every collected docid over to the downloader."""
        for id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1647
1648
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and queue the requested downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the literal 'yvsearch'
        query = query.encode('utf-8')

        if prefix == '':
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_yahoo_results)
        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
                count = self._max_yahoo_results
            return self._download_n_results(query, count)
        except ValueError: # parsing prefix as integer fails
            return self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen video identifiers from this page.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid in already_seen:
                    continue
                video_ids.append(vid)
                already_seen.add(vid)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for vid in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1733
1734
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose-regex style, so it must be
        # matched with re.VERBOSE explicitly.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch all pages of the playlist feed and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # Bug fix: initialize up front so an empty playlist (no 'entry' on
        # the first page -> break) does not raise NameError below.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            # Renamed from 'url' to avoid shadowing the method parameter.
            page_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(page_url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            # Keep (position, url) pairs so the list can be ordered afterwards.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1817
1818
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        # Consistency: route through to_screen like the other extractors
        # instead of hard-coding a '[youtube]' tag.
        self.to_screen(u'Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the video ids found in an HTML page, without duplicates."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        """Collect all video ids of the channel and return a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(channel_id, pagenum)
        page_url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(page_url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                page_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(page_url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The ajax response tells us whether a further page exists.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self.to_screen(u'Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
1891
1892
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        # Consistency: use to_screen (as BlipTVUserIE does) rather than a
        # hard-coded '[youtube]' tag.
        self.to_screen(u'user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect every upload of the user and return a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1963
1964
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Collect every episode of the user and return a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Robustness fix: a page without data-users-id previously raised an
        # uncaught AttributeError on mobj.group(1); fail with a clear error.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract users id')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Consistency fix: use compat_str like every other handler
                # here (plain str() can fail on py2 unicode messages).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2043
2044
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        url_match = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if url_match is None or url_match.group(1) is None:
            # Try to figure out reason of the error.
            reason_match = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if reason_match is not None and reason_match.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', reason_match.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = url_match.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        title_match = re.search(r'<b title="(.*?)">', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): the .decode() calls assume py2 byte strings here;
        # left untouched to preserve behavior.
        file_title = title_match.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2103
2104
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from --username/--password, or from the
        'facebook' entry in ~/.netrc when --netrc is given. Login
        failures only emit warnings; extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials supplied: skip login entirely.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response: urlopen().read() returns bytes, and the
            # str regex below would fail against bytes on Python 3.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # Being served the login form again means the login failed.
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract URL (HD preferred), title, duration and thumbnail for a
        Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit as a JSON array between these two
        # JavaScript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON nested inside the outer JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2202
2203
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % file_id)

    def report_direct_download(self, title):
        """Report that the URL turned out to be a direct media download."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        /play/ URLs are first resolved to their canonical page; a
        Content-Type of video/* means the URL is the media itself,
        otherwise the site's JSON API (skin=json) provides metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page whose URL fragment carries
        # the real file path; rebuild the canonical URL and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # Bug fix: basename is already a text string here; the old
                # title.decode('UTF-8') raised AttributeError on Python 3.
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the open handle to the downloader so the file is
                    # not requested twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert the site's datestamp format to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2304
2305
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this used to call self._download.report_error, which
            # raised AttributeError instead of reporting the error.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media server base URL; the flv
        # lives next to the thumbnails, named after the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2354
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrate labels the site is known to serve; keys of the two maps below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension for each bitrate label.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate label (used by --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self.to_screen(u'%s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report the download of a media item's configuration feed."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report the download of the episode's show index (mRSS feed)."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format labels with extension and resolution."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of an episode or clip.

        Flow: resolve shortname/newest-episode URLs, fetch the page,
        locate the mtvnservices player URI, download the mRSS index,
        then for each item download its mediaGen config, pick a bitrate
        and rewrite the RTMP URL into an HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # ':tds' / ':colbert' style abbreviations map to the newest
        # full episode of the respective show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Decide what title to report and whether we must follow the
        # redirect to the newest episode.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The bare full-episodes URL redirects to the newest episode;
            # re-parse the final URL to obtain its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Locate the mtvnservices player URI embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in an attribute
            # without a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The mRSS index lists one <item> per video part of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The <guid> is a media id like '...showname.com:12345'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Each media id has a mediaGen config listing its renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp path onto the mtvnmobile HTTP host.
            # NOTE(review): assumes the same asset is served there — the
            # base URL below is a magic value observed to work.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2549
2550
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce the start of information extraction for a show."""
        self.to_screen(u'%s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce the download of the player configuration."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Return the info dict for an Escapist video page.

        Reads the page's meta tags for description/thumbnail/player URL,
        then fetches the player configuration to get the media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            page_handle = compat_urllib_request.urlopen(url)
            page_bytes = page_handle.read()
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', page_handle.headers['Content-Type'])
            # Honour the declared charset, defaulting to UTF-8.
            page = page_bytes.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in the page's <meta> tags.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumbnail_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        media_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumbnail_url,
            'description': description,
            'player_url': player_url,
        }]
2624
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Resolve a collegehumor video page to an f4f fragment URL
        via the moogaloop metadata XML and the f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # Title, description, thumbnail and the manifest location all come
        # from the metadata document.
        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest supplies the media path and the canonical id.
        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid manifest file')
            return

        parsed = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = parsed.scheme + '://' + parsed.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2695
2696
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is carried, URL-quoted, in the flashvars string.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The title comes from the page's <title> element.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # The thumbnail is referenced directly in the markup.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2754
2755
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce resolution of the track URL to an API id."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def report_extraction(self, video_id):
        """Announce retrieval of the track's stream definitions."""
        self.to_screen(u'%s: Retrieving stream' % video_id)

    def _real_extract(self, url):
        """Resolve a soundcloud track page to its 128k mp3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The uploader name and the track slug are both part of the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Ask the resolve endpoint for the track's metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            info_json = compat_urllib_request.urlopen(compat_urllib_request.Request(resolv_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the per-track list of stream URLs.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            stream_json = compat_urllib_request.urlopen(compat_urllib_request.Request(streams_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2828
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # Distinct name so it does not clash with SoundcloudIE in --list-extractors
    IE_NAME = u'soundcloud:set'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Retrieving stream' % video_id)

    def _real_extract(self, url):
        """Resolve a set page URL and return one info dict per contained track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the canonical page URL into the set's API representation
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Ask the stream-definition endpoint for this track's media URLs
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2909
2910
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Decode the base64 video reference on the page into an rtmpe URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref holds a base64-encoded, url-quoted path
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split only on the last dot so filenames containing dots keep working
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2964
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def report_extraction(self, file_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % file_id)

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested bitrate, falling back to
        the highest available one; formats without bitrate info map
        directly to their url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue # dead link, try the next one
        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the cloudcast API and return the first working stream URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url; re groups are already text,
        # so the old .decode('utf-8') calls broke on Python 3
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes explicitly for Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try each format until one of its urls responds
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param if format_param is not None else u'NA'),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3079
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Handles three URL shapes: a specific video page, a course page
    (expanded into its videos), and the site root (expanded into all
    courses). Playlist entries are re-dispatched through self.extract(),
    so the result is always a flat list of video info dictionaries.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading webpage' % objid)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Dispatch on URL type: single video, course playlist, or site root."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Each video has a companion XML file with its title and file name
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # fall back to the course id when no heading is found
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect links to the individual video pages and recurse into them
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect links to all course pages and recurse into them
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3191
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Scrape the page metadata and the mediaGen XML for the stream URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns decoded text, so the old
        # .decode('iso-8859-1') calls on the match groups were wrong
        # and broke on Python 3
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error replaces the deprecated trouble() used before
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3271
3272
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two random numbers."""
        return "%d%d%d" % (int(time.time() * 1000),
                           random.randint(1000, 1998),
                           random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Derive the seed-shuffled character table used to decode file ids."""
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        # Linear-congruential walk over the alphabet: each step picks (and
        # removes) one character, so the table is a permutation of the source.
        for _ in range(len(alphabet)):
            state = (state * 211 + 30031) % 65536
            pos = int(math.floor(state / 65536 * len(alphabet)))
            shuffled.append(alphabet[pos])
            del alphabet[pos]
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated index string into the real file id."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(piece)] for piece in fileId.split('*') if piece)

    def _real_extract(self, url):
        """Download the playlist config and emit one info dict per flv segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            entry = config['data'][0]
            video_title = entry['title']
            seed = entry['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # Map the user's format request onto Youku's stream names
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Columns 8,9 of the fileid carry the segment number, so splice the
        # running index into that slot for every segment URL.
        files_info = []
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3382
3383
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction"""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Pull the flv url, title and thumbnail out of the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page body
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3446
3447
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry upload date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video linked from a Google+ post.

        Fetches the post page for metadata, then the photo/video page for
        the actual stream links, picking the highest resolution.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # bail out: continuing would crash on mobj.group(1) below
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # bail out: links[-1] below would raise IndexError otherwise
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3571
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path; no further
        # extraction needed for the stream itself.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first group of rexp in the page (HTML-unescaped),
            # or default when the pattern is absent.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was 'uploader_date', a typo nothing consumes; the
            # documented optional field name is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3607
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Group 1: channel name; group 2 (optional): single archived
    # broadcast id from a /b/<id> URL.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100  # clips requested per API call when paging
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % file_id)

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Fetch one API page and convert its clips to info dicts.

        Returns (total clips in response, info dicts for clips that have
        a video_file_url).
        NOTE(review): both error branches below return None, yet
        _real_extract unpacks the result as a 2-tuple — confirm that
        report_error aborts (raises) before control returns here.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a JSON object instead of a clip list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract all videos of a channel (paged) or one broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archives must be fetched page by page.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A page shorter than the limit means the oldest archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3694
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Fixed: the original called report_error and then fell through
            # to m.group() on None (AttributeError). Raise instead, matching
            # the other extractors in this file.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the <title> tag when the player heading is absent.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # Was self._downloader.trouble (deprecated); raising also
                # prevents m.group() on None below.
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3733
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')
        # Every trailer of a game is listed on its /video/<gameID>/ page.
        page_url = 'http://store.steampowered.com/video/%s/' % game_id
        webpage = self._download_webpage(page_url, game_id)

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        movie_iter = re.finditer(movie_re, webpage)
        title_iter = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_iter = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)

        results = []
        # The three lists appear in the same order on the page; walk them
        # in lockstep to pair each movie with its title and thumbnail.
        for movie, title_m, thumb_m in zip(movie_iter, title_iter, thumb_iter):
            vid_id = movie.group('videoID')
            vid_title = title_m.group('videoName')
            vid_url = movie.group('videoURL')
            vid_thumb = thumb_m.group('thumbnail')
            if not vid_url:
                self._downloader.report_error(u'Cannot find video url for %s' % vid_id)
            results.append({
                'id': vid_id,
                'url': vid_url,
                'ext': 'flv',
                'title': unescapeHTML(vid_title),
                'thumbnail': vid_thumb
            })
        return results
3774
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV lives on the CDN under the numeric recording id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3796
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # The source URL itself reveals the container format.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # Fixed: was the deprecated self._downloader.trouble(u'ERROR: ...');
            # report_error is what the sibling extractors use and it supplies
            # the 'ERROR:' prefix itself.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # NOTE(review): 'World Start Hip Hop' looks like a typo for
            # 'World Star' — left unchanged to avoid a user-visible change.
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3852
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON blob inside an inline script.
        meta_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not meta_m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append the bitrate parameter to request the 256k stream.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3887
3888
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; claim we passed it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; warn instead of failing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional; warn instead of failing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size+bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: the original tested the stale `result` variable here
            # (left over from the regex searches above), so an unavailable
            # requested format slipped through and returned [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
4006
4007
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The title is part of the URL itself; no page parsing needed for it.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed copy-pasted message: this branch fails on the upload
            # date, not the title.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4049
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the regular watch page first; it links to the embed page.
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        title_m = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_m is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_m.group('title').strip()

        # Get the embed page
        embed_m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_m.group(0).strip()
        video_id = embed_m.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the stream URL via an addVariable call.
        source_m = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_m is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_m.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4095
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata lives in an inline "PAGE.mix = {...};" assignment.
        mix_m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_m.group(1))

        # The play API wants an arbitrary session token plus the mix id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        # Each API response yields one track and tells us whether it is the last.
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4139
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the stream and its thumbnail are derived directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        title_m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_m.group('title'))

        uploader_m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_m.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
4163
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose pattern: matches either /playlists/<id>/<name> or /talks/<name>.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is verbose-mode, so a plain re.match (as a default
        # suitable() would do) needs the re.VERBOSE flag here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist handling based on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): [.\s] matches a literal dot or whitespace, not "any
        # character" — presumably (.|\s) was intended; confirm before changing.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk becomes a url_result so the TED talk extractor handles it.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline JS object carrying the numeric id and the
        # media slug used to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4241
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fixed NameError: the fallback referenced an undefined name
            # `ext`; the value computed above is called `extension`.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4297
class SpiegelIE(InfoExtractor):
    """Information extractor for Spiegel Online videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Playback details live in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last entry in the document is the format we pick.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4330
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # CONSISTENCY FIX: use report_error like the sibling extractors
            # instead of the deprecated trouble() (which needed the
            # hand-written u'ERROR: ' prefix).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: previously execution fell through to m.group('title')
            # with m being None, raising AttributeError instead of reporting.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4379
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId query parameter when present;
        # otherwise take the id from the URL path.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # Scrape the title and all announced media streams from the page.
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # Keep the default media type (0) and pick the highest quality of it.
        default_type = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(default_type, key=lambda s: int(s["quality"]))

        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        # Two delivery mechanisms exist: an RTMP stream or an HTTP download.
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4419
4420
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Instantiate in priority order: more specific extractors come first.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
4476
def get_info_extractor(ie_name):
    """Returns the info extractor class named ie_name + 'IE' from this module."""
    return globals()['%sIE' % ie_name]