Merge pull request #752 from dodo/master
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    An information extractor takes a URL and pulls out everything needed to
    download the video(s) it points at: the real media URL, the title, the
    uploader and so on. The result is a dictionary handed to the
    FileDownloader, which may download the file, print metadata, or perform
    any other configured action.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses re-define the _real_initialize() and _real_extract() methods
    and supply a _VALID_URL regexp; they should normally also be added to the
    list of extractors. _real_extract() must return a *list* of information
    dictionaries as described above.

    Broken extractors should set the _WORKING attribute to False so users
    are warned and the tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc). Idempotent."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by dropping the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        status_line = u'[%s] %s: %s' % (self.IE_NAME, video_id, note)
        self._downloader.to_screen(status_line)
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = handle.headers.get('Content-Type', '')
        # Honour an explicit charset in the Content-Type header; fall back to UTF-8.
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        return handle.read().decode(encoding, 'replace')
137
138
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag format codes to file extensions; anything absent defaults to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # Maps itag format codes to "HEIGHTxWIDTH" strings for display purposes.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs can also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language codes to track names, or a
        (warning message, None) tuple when the list cannot be retrieved
        or is empty."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # findall yields (name, lang_code) pairs; key the dict by language code.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'WARNING: video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for the given video."""
        # NOTE(review): on error _get_available_subtitles returns a tuple,
        # which report_video_subtitles_available would then .keys() — verify.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Fetch one subtitle track. Returns (None, sub_lang, sub) on
        success, or a 2-tuple (warning message, None) on failure."""
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not sub:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """Download one subtitle track, preferring the requested language,
        then English, then the first available. Returns a one-element list."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            # NOTE(review): if _get_available_subtitles returned an error
            # tuple, .keys() here would raise AttributeError — confirm.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of the
        tuples produced by _request_subtitle."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print each format code with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, when credentials are configured
        (params or .netrc), log in to Google and confirm age. All failures
        are reported through the downloader and abort initialization."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the GALX and dsh hidden form tokens required by the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, login failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the YouTube video ID matched by _VALID_URL (its second
        capture group), or None after reporting an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download webpage and video-info pages for the URL and return a
        list of information dictionaries (one per selected format)."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The URL is JS-escaped in the page; strip the backslashes.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                # NOTE(review): bare except skips formats that don't parse,
                # but also hides any unrelated error — consider ValueError.
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        # NOTE(review): the subtitle helpers return 2-tuples on error but
        # the unpacking below expects 3 elements — verify the error paths.
        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token (extracted but not used further in this method)
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig'
            # parameter; a missing 'sig' raises KeyError here — confirm.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
628
629
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    ``_real_initialize`` posts the family-filter form once so that
    age-restricted pages are served.  Watch URLs whose id starts with
    ``yt-`` are YouTube mirrors and are handed back to the downloader
    as plain YouTube URLs.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age: POST the family-filter form with filtering disabled
        # so age-restricted pages are served for the rest of the session.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a watch page.

        Returns a one-item list of info dicts, or None after reporting
        an error (the downloader handles the error channel).
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube ("yt-<id>" ids are mirrors);
        # if so, delegate to the downloader's YouTube handling.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # FIX: message previously read "unable retrieve video webpage"
            self._downloader.report_error(u'unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended to the media URL) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse the flashvars blob, whose "mediaData" field
            # carries a JSON-ish mediaURL/key pair.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # JSON escapes slashes as "\/"
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
755
756
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # NOTE(review): flagged as not working -- presumably the page markup the
    # regexes below target has drifted; confirm before re-enabling.
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Scrape one Dailymotion watch page into a one-item info list.

        Picks the best available quality by scanning the page's flashvars
        for known quality keys in descending order of preference.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the path component up to the first '_' or '?'.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Highest quality first; the for/else reports failure only when
        # none of the known quality keys appears in the flashvars blob.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.report_error(u'unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # Undo URL-quoting, then the JSON-style "\/" slash escaping.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for official user (different markup); a miss here is
            # only a warning, not a fatal error.
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date appears on the page as DD-MM-YYYY; reorder to YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
844
845
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Announce that the watch page is being fetched."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Pull media URL, title and uploader out of a Photobucket page."""
        # The video id (an .flv filename) is captured by _VALID_URL.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        video_id = match.group(1)

        # Photobucket only serves Flash video.
        video_extension = 'flv'

        # Fetch the watch page; everything we need is embedded in it.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The direct media URL lives in the video_src <link> tag.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader come from one <title> pattern (groups 1 and 2).
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
909
910
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video page.

        Non-'/watch/' URLs are first rewritten to the canonical watch URL
        and re-extracted (one level of recursion, guarded by new_video).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # FIX: group(1) is the (people|profile) alternation; the uploader
        # name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1052
1053
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract info from a Vimeo page via its embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize: force https and canonicalize direct-link URLs.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # FIX: narrowed from a bare "except:" so KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (ISO date in a meta tag, reordered to YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best bucket that has at least one entry; the for/else
        # reports failure only when every bucket is empty.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1172
1173
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in "index-<n>.html"; matched against the last URL segment.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map captured groups to dict keys.

        matchTuples is a list of (group_index, key, error_message) triples.
        Returns the dict of captured groups, or None after reporting an error.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): uses the legacy trouble() channel while the
                # rest of this class uses report_error -- confirm intended.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP parameters for a live-stream page.

        NOTE(review): the assembled video_url below is never returned or
        stored, so live URLs currently produce no downloadable result --
        this looks like dead code / an unfinished feature; confirm.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page to an info dict via two XML indirections."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Arte+7 path based on the URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns nothing, so this path
            # yields no result (see the note on that method).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1308
1309
1310 class GenericIE(InfoExtractor):
1311     """Generic last-resort information extractor."""
1312
1313     _VALID_URL = r'.*'
1314     IE_NAME = u'generic'
1315
    def __init__(self, downloader=None):
        """Plain pass-through to the shared InfoExtractor constructor."""
        InfoExtractor.__init__(self, downloader)
1318
    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # The fallback warning is suppressed while the test suite runs
        # (the downloader's 'test' param is set there).
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1324
    def report_extraction(self, video_id):
        """Announce that metadata extraction for *video_id* has begun."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1328
    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed to *new_url*."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1332
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request for *url*; if the final URL differs from the
        original, hands the new URL back to the downloader (restarting
        extractor selection) and returns True. Returns False when *url*
        is not redirected.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force urllib to use the HEAD method so no body is downloaded.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers: a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other status is surfaced as an HTTP error.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue the same URL as a plain (GET) request through
                # the same opener chain.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # geturl() reflects the URL after all redirects; unchanged means
        # no redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1387
1388     def _real_extract(self, url):
1389         if self._test_redirect(url): return
1390
1391         video_id = url.split('/')[-1]
1392         try:
1393             webpage = self._download_webpage(url, video_id)
1394         except ValueError as err:
1395             # since this is the last-resort InfoExtractor, if
1396             # this error is thrown, it'll be thrown here
1397             self._downloader.report_error(u'Invalid URL: %s' % url)
1398             return
1399
1400         self.report_extraction(video_id)
1401         # Start with something easy: JW Player in SWFObject
1402         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1403         if mobj is None:
1404             # Broaden the search a little bit
1405             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1406         if mobj is None:
1407             # Broaden the search a little bit: JWPlayer JS loader
1408             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1409         if mobj is None:
1410             self._downloader.report_error(u'Invalid URL: %s' % url)
1411             return
1412
1413         # It's possible that one of the regexes
1414         # matched, but returned an empty group:
1415         if mobj.group(1) is None:
1416             self._downloader.report_error(u'Invalid URL: %s' % url)
1417             return
1418
1419         video_url = compat_urllib_parse.unquote(mobj.group(1))
1420         video_id = os.path.basename(video_url)
1421
1422         # here's a fun little line of code for you:
1423         video_extension = os.path.splitext(video_id)[1][1:]
1424         video_id = os.path.splitext(video_id)[0]
1425
1426         # it's tempting to parse this further, but you would
1427         # have to take into account all the variations like
1428         #   Video Title - Site Name
1429         #   Site Name | Video Title
1430         #   Video Title - Tagline | Site Name
1431         # and so on and so forth; it's just not practical
1432         mobj = re.search(r'<title>(.*)</title>', webpage)
1433         if mobj is None:
1434             self._downloader.report_error(u'unable to extract title')
1435             return
1436         video_title = mobj.group(1)
1437
1438         # video uploader is domain name
1439         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1440         if mobj is None:
1441             self._downloader.report_error(u'unable to extract title')
1442             return
1443         video_uploader = mobj.group(1)
1444
1445         return [{
1446             'id':       video_id,
1447             'url':      video_url,
1448             'uploader': video_uploader,
1449             'upload_date':  None,
1450             'title':    video_title,
1451             'ext':      video_extension,
1452         }]
1453
1454
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API endpoint; takes the quoted query and a 1-based start index.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse an ytsearchN:term pseudo-URL and queue the matching videos.

        An empty prefix downloads one result, 'all' downloads up to the
        API maximum, and a number downloads that many (capped).
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                # Fix: was the deprecated trouble(); report_error keeps error
                # handling consistent with every other extractor in this file.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the real total; never page beyond it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1533
1534
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a gvsearchN:term query and queue that many search results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                count = self._max_google_results
            self._download_n_results(query, count)
        except ValueError:  # prefix is not an integer
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        found_ids = []
        pagenum = 0

        def _queue_all():
            # Hand every collected id to the downloader.
            for vid in found_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen identifiers in page order.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in found_ids:
                    continue
                found_ids.append(candidate)
                if len(found_ids) == n:
                    # Requested number of videos reached; stop paging.
                    _queue_all()
                    return

            # No "next page" link means the results are exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _queue_all()
                return

            pagenum += 1
1615
1616
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a yvsearchN:term query and queue that many search results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
                count = self._max_yahoo_results
            self._download_n_results(query, count)
        except ValueError:  # prefix is not an integer
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        collected = []
        seen = set()
        pagenum = 1

        def _queue_all():
            # Hand every collected id to the downloader.
            for vid in collected:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen identifiers in page order.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid in seen:
                    continue
                seen.add(vid)
                collected.append(vid)
                if len(collected) == n:
                    # Requested number of videos reached; stop paging.
                    _queue_all()
                    return

            # No "Next" link means the results are exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _queue_all()
                return

            pagenum += 1
1701
1702
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose pattern (requires re.VERBOSE): matches full playlist/course/
    # user URLs as well as bare PL/EC/UU playlist ids.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData API endpoint; takes playlist id, page size, and 1-based start index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50  # page size requested from the API
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses whitespace and comments, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch playlist entries page by page from the GData API and queue
        each video URL with the downloader."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        # group(1) is set by the full-URL alternative, group(2) by a bare id.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so entries can be sorted afterwards;
            # entries lacking 'content' are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means the playlist is exhausted.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Order by playlist position, then drop the positions.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Apply --playlist-start / --playlist-end (1-based; -1 means no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1795
1796
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk the channel's paged video listing and queue every video id."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Deduplicate ids within this page while preserving their order.
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = id_match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # Stop when the listing has no "Next" link.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1847
1848
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch all of a user's upload ids via the GData API and queue them."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = match.group(1)

        # The GData API returns at most _GDATA_PAGE_SIZE ids per request,
        # so keep requesting the next window until a short page arrives.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Deduplicate ids within this page while preserving their order.
            page_ids = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = id_match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # A page smaller than the maximum must be the last one, so
            # there is no need to query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply --playlist-start / --playlist-end (1-based; -1 means no end).
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        if last == -1:
            video_ids = video_ids[first:]
        else:
            video_ids = video_ids[first:last]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1930
1931
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12  # videos returned per AJAX page
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect a blip.tv user's video ids page by page and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Fix: the regex match was previously unguarded inside the network
        # try/except, so a missing data-users-id attribute raised an
        # uncaught AttributeError instead of a proper error message.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fix: use compat_str(err) for consistency with the other
                # extractors (plain str(err) here was an outlier).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply --playlist-start / --playlist-end (1-based; -1 means no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2021
2022
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Fix: decode the response to text. The regexes below use str
            # patterns, and on Python 3 matching them against bytes raises
            # TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the restriction notice.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # Fix: all values are already text after the decode above; the old
        # .decode('utf-8') calls on these str objects failed on Python 3.
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2081
2082
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Best-effort login using --username/--password or .netrc credentials.

        Failures only emit warnings; extraction proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials from either source: stay anonymous
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains the login form, the login failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video URL/title/duration from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf parameters are embedded as a JSON array between these two
        # literal JavaScript fragments on the page.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD source; fall back to SD when HD is absent/empty
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2178
2179
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Return the info dict list for a blip.tv URL.

        /play/ embed URLs are resolved by following the redirect, pulling the
        file id out of the URL fragment and restarting extraction on the
        canonical http://blip.tv/a/a-<id> URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            # The fragment holds query-style parameters; 'file' ends with the id
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves the JSON skin when it sees an iTunes user agent
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # BUG FIX: on Python 3 `title` is already str and has no
                # .decode(); only decode the Python 2 byte-string case.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp like '12-31-12 11:59PM' -> 'YYYYMMDD'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2280
2281
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.report_error` — the attribute is
            # `_downloader`, so every invalid URL raised AttributeError
            # instead of reporting the error.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media base URL; the flv lives next to it
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2330
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates the site offers, lowest to highest
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate (for --list-formats output)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate/extension/resolution table."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of an episode or clip.

        Resolves shortname/latest-episode URLs first, then downloads the
        MRSS index and, per item, the mediagen configuration to pick an RTMP
        rendition which is finally rewritten to a plain HTTP URL.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # :tds / :colbert style abbreviations point at the newest full episode
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "download the newest episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The site redirects /full-episodes/ to the newest episode;
            # re-parse the redirected URL to get its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # Each <item> is one part of the episode
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (bitrate, rtmp URL) pairs, in document order (lowest first)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL to the equivalent plain-HTTP mirror
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2525
2526
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video info from an escapistmagazine.com view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset declared in the Content-Type header, if any
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # ROBUSTNESS: the original dereferenced these matches unchecked and
        # crashed with AttributeError when a meta tag was missing; report a
        # clean error instead, matching the rest of the file.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.report_error(u'unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.report_error(u'unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # playlist[1] is the actual video entry (playlist[0] is an intro)
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2600
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken: the site/API this targets no longer works as coded
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build the info dict from the moogaloop metadata XML plus the
        Adobe HDS (f4m) manifest it points at."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore query parameter is required by the HDS server
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m namespace-qualified elements; first <media> carries the url
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Compose the fragment URL from the manifest location and media id
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2671
2672
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The video URL is URL-encoded inside a flashvars-style parameter
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the page <title>, minus the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: keep the whole matched URL (group 0)
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2730
2731
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the track via the SoundCloud API, then fetch its stream
        definitions and return the 128kbps MP3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve.json maps a public track URL to the API's track metadata
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint lists the per-format media URLs for the track
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2804
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets (playlists).

    Resolves a set URL via the public resolve.json API, then fetches the
    stream definitions of every track in the set and returns one info dict
    (direct http mp3 stream) per track.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error for consistency with SoundcloudIE (the old
            # trouble(u'ERROR: ...') interface is deprecated).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the slug of the set title are part of the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        # The resolve API signals failures inside the JSON body.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                # NOTE(review): raw API timestamp, not YYYYMMDD (see class docs).
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2885
2886
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The page embeds the real video path base64-encoded in a jsclassref
    attribute; that path is appended to the rtmpe streaming base URL.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL (base64-encoded in the jsclassref attribute)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; keep a placeholder when absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # rsplit: the filename may itself contain dots; a plain split('.')
        # would raise ValueError on names like 'foo.bar.mp4'.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2940
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Fetches the cloudcast JSON via the v1 API and picks a working URL from
    the 'audio_formats' section, honoring --format / --list-formats.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested bitrate, falling back to the
        highest available one; formats without per-bitrate info map the
        format name directly to a url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None when none responds."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # try the next candidate
                pass
        return None

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (Python 3 fix: match groups are already str, so the old
        # .decode('utf-8') calls would raise AttributeError)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode explicitly: json.loads on bytes needs py>=3.6)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3055
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video (course + video params),
    a course page (playlist of videos), and the site root (playlist of
    courses). Playlist entries are re-dispatched through self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # raw string: '\?' is an invalid escape in a plain string literal
            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # decode to str: re.findall with a str pattern on a bytes
                # page raises TypeError on Python 3
                rootpage_bytes = compat_urllib_request.urlopen(rootURL).read()
                rootpage = rootpage_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3167
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Reads performer/song metadata from the video page's meta tags, then
    fetches the mediaGen XML and picks the highest-quality rendition.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        # webpage is already a decoded string (str regexes are applied to
        # it); the old .decode('iso-8859-1') raised AttributeError on py3
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error for consistency with the rest of this extractor
            # (trouble() is the deprecated interface)
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3247
3248
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Downloads the playlist JSON for a video id, deobfuscates the segment
    file ids with the server-provided seed, and returns one info dict per
    video segment.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a session id from the current time plus two random ints."""
        now_ms = int(time.time() * 1000)
        rand_a = random.randint(1000, 1998)
        rand_b = random.randint(1000, 9999)
        return "%d%d%d" % (now_ms, rand_a, rand_b)

    def _get_file_ID_mix_string(self, seed):
        """Return the character permutation derived from the numeric seed.

        Runs Youku's deterministic linear-congruential shuffle over the
        alphabet; the resulting list maps obfuscated indices to characters.
        """
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        # Every character in the alphabet is unique, so popping by index is
        # equivalent to removing by value.
        while alphabet:
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(alphabet)))
            shuffled.append(alphabet.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated '*'-separated file id via the seed permutation."""
        lookup = self._get_file_ID_mix_string(seed)
        return ''.join(lookup[int(part)] for part in fileId.split('*') if part)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            requested = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested quality onto Youku's internal format names.
            if requested is None or requested == 'best':
                requested = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                requested = 'mp4'
                ext = u'mp4'
            else:
                requested = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][requested]
            keys = [s['k'] for s in config['data'][0]['segs'][requested]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Two characters of the decoded file id encode the segment number and
        # are substituted per segment. NOTE(review): the original comment
        # spoke of fileid[7:9], but the code replaces fileid[8:10] -- the
        # code's behavior is preserved here.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3358
3359
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com.

    The flv URL, title and thumbnail are all scraped from the video page
    with the regexes defined as class attributes below.
    """

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce the webpage download on screen."""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Announce the information-extraction step on screen."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page body.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3422
3423
3424 class GooglePlusIE(InfoExtractor):
3425     """Information extractor for plus.google.com."""
3426
3427     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3428     IE_NAME = u'plus.google'
3429
    def __init__(self, downloader=None):
        # Plain delegation to the base class; no extractor-specific state.
        InfoExtractor.__init__(self, downloader)
3432
3433     def report_extract_entry(self, url):
3434         """Report downloading extry"""
3435         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3436
3437     def report_date(self, upload_date):
3438         """Report downloading extry"""
3439         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3440
3441     def report_uploader(self, uploader):
3442         """Report downloading extry"""
3443         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3444
3445     def report_title(self, video_title):
3446         """Report downloading extry"""
3447         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3448
3449     def report_extract_vid_page(self, video_page):
3450         """Report information extraction."""
3451         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3452
3453     def _real_extract(self, url):
3454         # Extract id from URL
3455         mobj = re.match(self._VALID_URL, url)
3456         if mobj is None:
3457             self._downloader.report_error(u'Invalid URL: %s' % url)
3458             return
3459
3460         post_url = mobj.group(0)
3461         video_id = mobj.group(1)
3462
3463         video_extension = 'flv'
3464
3465         # Step 1, Retrieve post webpage to extract further information
3466         self.report_extract_entry(post_url)
3467         request = compat_urllib_request.Request(post_url)
3468         try:
3469             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3470         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3471             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3472             return
3473
3474         # Extract update date
3475         upload_date = None
3476         pattern = 'title="Timestamp">(.*?)</a>'
3477         mobj = re.search(pattern, webpage)
3478         if mobj:
3479             upload_date = mobj.group(1)
3480             # Convert timestring to a format suitable for filename
3481             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3482             upload_date = upload_date.strftime('%Y%m%d')
3483         self.report_date(upload_date)
3484
3485         # Extract uploader
3486         uploader = None
3487         pattern = r'rel\="author".*?>(.*?)</a>'
3488         mobj = re.search(pattern, webpage)
3489         if mobj:
3490             uploader = mobj.group(1)
3491         self.report_uploader(uploader)
3492
3493         # Extract title
3494         # Get the first line for title
3495         video_title = u'NA'
3496         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3497         mobj = re.search(pattern, webpage)
3498         if mobj:
3499             video_title = mobj.group(1)
3500         self.report_title(video_title)
3501
3502         # Step 2, Stimulate clicking the image box to launch video
3503         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3504         mobj = re.search(pattern, webpage)
3505         if mobj is None:
3506             self._downloader.report_error(u'unable to extract video page URL')
3507
3508         video_page = mobj.group(1)
3509         request = compat_urllib_request.Request(video_page)
3510         try:
3511             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3512         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3513             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3514             return
3515         self.report_extract_vid_page(video_page)
3516
3517
3518         # Extract video links on video page
3519         """Extract video links of all sizes"""
3520         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3521         mobj = re.findall(pattern, webpage)
3522         if len(mobj) == 0:
3523             self._downloader.report_error(u'unable to extract video links')
3524
3525         # Sort in resolution
3526         links = sorted(mobj)
3527
3528         # Choose the lowest of the sort, i.e. highest resolution
3529         video_url = links[-1]
3530         # Only get the url. The resolution part in the tuple has no use anymore
3531         video_url = video_url[-1]
3532         # Treat escaped \u0026 style hex
3533         try:
3534             video_url = video_url.decode("unicode_escape")
3535         except AttributeError: # Python 3
3536             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3537
3538
3539         return [{
3540             'id':       video_id,
3541             'url':      video_url,
3542             'uploader': uploader,
3543             'upload_date':  upload_date,
3544             'title':    video_title,
3545             'ext':      video_extension,
3546         }]
3547
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first (unescaped) group of rexp in the page, or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: the key was misspelled 'uploader_date'; the documented
            # optional field is 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3583
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and parse it.

        Returns (total_item_count, info_list) where info_list contains only
        clips with a usable video_file_url, or None after reporting an error.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API returns an error object instead of a list
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip dashes for YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archives must be fetched page by page
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            # BUGFIX: _parse_page returns None on download/API errors; the
            # old code unpacked the result unconditionally and crashed with
            # a TypeError instead of stopping gracefully.
            page_result = self._parse_page(page_url)
            if page_result is None:
                break
            page_count, page_info = page_result
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3670
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # BUGFIX: without this return, m.group() below raised
            # AttributeError on None after the error was reported
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # BUGFIX: trouble() is deprecated; use report_error like the
            # other extractors, and bail out instead of crashing below
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3707
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default matcher
        # (which compiles without flags) cannot be used here
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # URL, title and thumbnail matches appear in the same page order
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # BUGFIX: previously the broken entry was appended anyway;
                # report the problem and skip this video instead
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return videos
3748
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The media file location on the CDN follows directly from the id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"',webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3770
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and candy) videos."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Direct media URLs are embedded in the page markup
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # BUGFIX: _downloader.trouble() is deprecated and the message
            # hand-wrote the 'ERROR:' prefix; report_error (used by the
            # sibling extractors) adds the prefix itself.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3826
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON blob in an inline script
        json_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if json_m is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3861
3862
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats matching req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes resolution and bitrate, e.g. 480p_370k
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: the old code tested the stale 'result' match object
            # here, so an unavailable requested format was never detected
            # and [None] was returned.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3979
3980
3981
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is part of the URL itself
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: the error message wrongly said 'video title' although
            # this branch fails on the upload date
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4023
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            # BUGFIX: the messages carried a redundant 'ERROR:' prefix; the
            # downloader already prefixes ExtractorError output (compare the
            # sibling extractors' messages)
            raise ExtractorError(u'unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
4069
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata lives in an inline JSON assignment
        mix_m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if mix_m is None:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_m.group(1))

        # A random session id is enough to drive the play API
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        for index in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4113
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs on the CDN follow directly from the id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_m.group('title'))
        uploader_m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_m.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4137
4138 class TEDIE(InfoExtractor):
4139     _VALID_URL=r'''http://www.ted.com/
4140                    (
4141                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4142                         |
4143                         ((?P<type_talk>talks)) # We have a simple talk
4144                    )
4145                    /(?P<name>\w+) # Here goes the name and then ".html"
4146                    '''
4147
4148     @classmethod
4149     def suitable(cls, url):
4150         """Receives a URL and returns True if suitable for this IE."""
4151         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4152
4153     def _real_extract(self, url):
4154         m=re.match(self._VALID_URL, url, re.VERBOSE)
4155         if m.group('type_talk'):
4156             return [self._talk_info(url)]
4157         else :
4158             playlist_id=m.group('playlist_id')
4159             name=m.group('name')
4160             self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4161             return self._playlist_videos_info(url,name,playlist_id)
4162
4163     def _talk_video_link(self,mediaSlug):
4164         '''Returns the video link for that mediaSlug'''
4165         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4166
4167     def _playlist_videos_info(self,url,name,playlist_id=0):
4168         '''Returns the videos of the playlist'''
4169         video_RE=r'''
4170                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4171                      ([.\s]*?)data-playlist_item_id="(\d+)"
4172                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4173                      '''
4174         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4175         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4176         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4177         m_names=re.finditer(video_name_RE,webpage)
4178         info=[]
4179         for m_video, m_name in zip(m_videos,m_names):
4180             video_id=m_video.group('video_id')
4181             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4182             info.append(self._talk_info(talk_url,video_id))
4183         return info
4184
4185     def _talk_info(self, url, video_id=0):
4186         """Return the video for the talk in the url"""
4187         m=re.match(self._VALID_URL, url,re.VERBOSE)
4188         videoName=m.group('name')
4189         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4190         # If the url includes the language we get the title translated
4191         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4192         title=re.search(title_RE, webpage).group('title')
4193         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4194                         "id":(?P<videoID>[\d]+).*?
4195                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4196         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4197         thumb_match=re.search(thumb_RE,webpage)
4198         info_match=re.search(info_RE,webpage,re.VERBOSE)
4199         video_id=info_match.group('videoID')
4200         mediaSlug=info_match.group('mediaSlug')
4201         video_url=self._talk_video_link(mediaSlug)
4202         info = {
4203                 'id': video_id,
4204                 'url': video_url,
4205                 'ext': 'mp4',
4206                 'title': title,
4207                 'thumbnail': thumb_match.group('thumbnail')
4208                 }
4209         return info
4210
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: this fallback referenced the undefined name 'ext',
            # raising NameError whenever <format_id> was missing; fall back
            # to the file extension as the other extractors do.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4266
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de video pages."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Per-video XML document describing the available files.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last entry in the document, as the original code did.
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4299
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages (liveleak.com/view?i=...)."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Consistency fix: use report_error like the other extractors
            # instead of the deprecated _downloader.trouble.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Direct video URL embedded in the player setup.
        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: previously the error was only reported and the code
            # fell through to m.group(...) on None, raising AttributeError.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4348
4349
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this sequence ordered from most to least specific; GenericIE
    # must stay last as the catch-all.
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]