Fix crash when subtitles are not found
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; instances overwrite _ready/_downloader in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the IE name from the class name by dropping the "IE" suffix.
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour a charset declared in the Content-Type header, else UTF-8.
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
                                 handle.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        return handle.read().decode(encoding, 'replace')
137
138
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string (only used for --list-formats / format label)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; let the playlist IE claim them.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download the subtitle track for one language."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} of available subtitles.

        On failure (network error, or the video simply has no subtitles)
        returns a (warning_message, None) tuple instead of a dict; callers
        must check for that with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'WARNING: video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for a video, if any."""
        sub_lang_list = self._get_available_subtitles(video_id)
        if isinstance(sub_lang_list, tuple):
            # _get_available_subtitles signals failure with a (warning, None)
            # tuple; report the warning instead of crashing on tuple.keys()
            # inside report_video_subtitles_available.
            self._downloader.trouble(sub_lang_list[0])
            return
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Download one subtitle track.

        Returns (None, sub_lang, contents) on success, or a
        (warning_message, None) tuple on failure.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not sub:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Pick the requested language, falling back to English, then to
        # whatever language happens to be available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track.

        Returns a list of (error_message, sub_lang, sub) tuples, one per
        language (or a single error tuple when the listing itself failed).
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):
            # Listing failed (or no subtitles exist): iterating the error
            # tuple below would crash with a TypeError on string indexing.
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one "itag : ext [dimensions]" line per available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and optionally log in / confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden anti-forgery fields out of the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is served back, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-char-style video ID from *url*, or None on failure."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)
        if video_id is None:
            # _extract_id already reported the error; don't build a
            # "watch?v=None" URL below.
            return

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    # Parsed successfully; further formats would only raise.
                    break
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles: the helpers return (error, lang, contents) tuples and
        # handle the "no subtitles found" case themselves.
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig'
            # parameter; a missing one would raise KeyError here — confirm.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
634
635
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Visit the family-filter disclaimer page and confirm age.

        Both steps are needed before filtered videos are served.  Network
        failures are reported via the downloader and swallowed, matching
        the error style of the other extractors.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube: metacafe mirrors such videos
        # under a 'yt-<id>' identifier, so delegate to the youtube IE.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Message previously read 'unable retrieve video webpage'.
            self._downloader.report_error(u'unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available; when present it must be passed
            # back as a '__gda__' query parameter to authorize the download.
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob, whose 'mediaData' entry
            # carries the media URL and access key in JSON-ish form.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): .decode('utf-8') assumes byte-string regex results
        # (Python 2 behaviour) -- confirm before porting to Python 3.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
761
762
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title, uploader and date from a video page."""
        # The first path component carries the video id, possibly followed
        # by a slug ('_...') or a query string ('?...').
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best quality variant present in the flashvars blob.
        quality_keys = ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url')
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # The uploader appears either as a regular owner span or, for
        # official accounts, as a rel="author" span.
        video_uploader = None
        owner = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner is not None:
            video_uploader = owner.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date is shown as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
850
851
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV media URL, title and uploader from the page."""
        # The video id is the 'current=' query argument captured by _VALID_URL.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Download the watch page.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The direct media URL lives in the video_src link element.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader come from a single <title> pattern.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
915
916
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info; recurses once to canonicalize non-/watch/ URLs."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUG FIX: group(1) is the literal 'people'/'profile' path segment;
        # the uploader name is in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1058
1059
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from the embedded config JSON of a vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Canonicalize: force https and resolve play_redirect_hls links to
        # the plain video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player setup script.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # BUG FIX: was a bare 'except:', which also swallowed
            # KeyboardInterrupt/SystemExit and prevented a clean Ctrl-C.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1178
1179
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its content, or None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and scrape it with regex.

        matchTuples is a list of (group index, key, error message)
        triples; each matched group is stored under key in the returned
        dict.  Returns None when the page could not be fetched or the
        pattern (or one of its groups) did not match.
        """
        page = self.fetch_webpage(url)
        # BUG FIX: fetch_webpage returns None on download errors; bail out
        # here instead of crashing with TypeError in re.search below.
        if page is None:
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP data of a live stream.

        NOTE(review): the computed video_url is discarded and nothing is
        returned, so live streams are effectively unsupported; preserved
        as-is pending proper live support.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        # BUG FIX: grep_webpage may return None; previously this crashed
        # with AttributeError on info.get.
        if info is None:
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref chain of an arte+7 page and return its info dict.

        Returns None when any step of the chain cannot be scraped.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        # BUG FIX: guard each grep_webpage result against None to avoid
        # AttributeError crashes when a page or pattern fails.
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)
        # BUG FIX: extraction may have failed (None); don't return [None].
        if info is None:
            return

        return [info]
1314
1315
1316 class GenericIE(InfoExtractor):
1317     """Generic last-resort information extractor."""
1318
1319     _VALID_URL = r'.*'
1320     IE_NAME = u'generic'
1321
1322     def __init__(self, downloader=None):
1323         InfoExtractor.__init__(self, downloader)
1324
1325     def report_download_webpage(self, video_id):
1326         """Report webpage download."""
1327         if not self._downloader.params.get('test', False):
1328             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1329         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1330
1331     def report_extraction(self, video_id):
1332         """Report information extraction."""
1333         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1334
1335     def report_following_redirect(self, new_url):
1336         """Report information extraction."""
1337         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1338
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET so redirects
            # can be resolved without downloading the response body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue the same URL as a plain GET through this opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE: a hand-assembled OpenerDirector is used (not build_opener) so
        # only exactly these handlers participate.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # geturl() is the final URL after redirects; identical means no
        # redirect happened and the current extractor should proceed.
        if url == new_url:
            return False

        # Hand the resolved URL back to the downloader so the full extractor
        # chain is restarted against it.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1393
1394     def _real_extract(self, url):
1395         if self._test_redirect(url): return
1396
1397         video_id = url.split('/')[-1]
1398         try:
1399             webpage = self._download_webpage(url, video_id)
1400         except ValueError as err:
1401             # since this is the last-resort InfoExtractor, if
1402             # this error is thrown, it'll be thrown here
1403             self._downloader.report_error(u'Invalid URL: %s' % url)
1404             return
1405
1406         self.report_extraction(video_id)
1407         # Start with something easy: JW Player in SWFObject
1408         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1409         if mobj is None:
1410             # Broaden the search a little bit
1411             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1412         if mobj is None:
1413             # Broaden the search a little bit: JWPlayer JS loader
1414             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1415         if mobj is None:
1416             self._downloader.report_error(u'Invalid URL: %s' % url)
1417             return
1418
1419         # It's possible that one of the regexes
1420         # matched, but returned an empty group:
1421         if mobj.group(1) is None:
1422             self._downloader.report_error(u'Invalid URL: %s' % url)
1423             return
1424
1425         video_url = compat_urllib_parse.unquote(mobj.group(1))
1426         video_id = os.path.basename(video_url)
1427
1428         # here's a fun little line of code for you:
1429         video_extension = os.path.splitext(video_id)[1][1:]
1430         video_id = os.path.splitext(video_id)[0]
1431
1432         # it's tempting to parse this further, but you would
1433         # have to take into account all the variations like
1434         #   Video Title - Site Name
1435         #   Site Name | Video Title
1436         #   Video Title - Tagline | Site Name
1437         # and so on and so forth; it's just not practical
1438         mobj = re.search(r'<title>(.*)</title>', webpage)
1439         if mobj is None:
1440             self._downloader.report_error(u'unable to extract title')
1441             return
1442         video_title = mobj.group(1)
1443
1444         # video uploader is domain name
1445         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1446         if mobj is None:
1447             self._downloader.report_error(u'unable to extract title')
1448             return
1449         video_uploader = mobj.group(1)
1450
1451         return [{
1452             'id':       video_id,
1453             'url':      video_url,
1454             'uploader': video_uploader,
1455             'upload_date':  None,
1456             'title':    video_title,
1457             'ext':      video_extension,
1458         }]
1459
1460
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # "ytsearch:terms", "ytsearchN:terms" or "ytsearchall:terms"
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested result count."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            # Bare "ytsearch:" means first result only.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                # Consistency fix: use report_error like every other
                # extractor in this file; trouble() is the deprecated hook.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1539
1540
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Announce the download of one page of search results."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and fetch the requested result count."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # Empty prefix -> first result only; 'all' -> the site maximum.
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                if n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Page through search results until *n* ids are found, then queue them."""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect every identifier on this page, skipping duplicates,
            # and stop as soon as the requested count is reached.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in video_ids:
                    video_ids.append(candidate)
                if len(video_ids) == n:
                    break

            # Done when we have enough ids or there is no "next page" link.
            if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                return

            pagenum = pagenum + 1
1621
1622
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Marked broken; kept for reference until the site markup is re-checked.
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Split "yvsearch<N>:terms" into the count prefix and the search terms.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            # Bare "yvsearch:" downloads only the first result.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            # NOTE(review): `page` is undecoded bytes here while the patterns
            # are str, which fails on Python 3 — confirm before clearing
            # _WORKING.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "Next" link: queue whatever was collected and stop.
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1707
1708
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex (matched with re.VERBOSE): group 1 captures a playlist id
    # embedded in the various page URLs, group 2 a bare "PL.."/"EC.."/"UU.." id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    # GData page size; a shorter page signals the last one.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            # GData start-index is 1-based.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs; entries without 'content' carry no
            # downloadable source and are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Order by playlist position, then drop the position key.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Apply the --playlist-start / --playlist-end window (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1801
1802
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Announce the download of one channel listing page."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk the paginated channel listing and queue every video found."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        # Fetch listing pages until the "Next" marker disappears.
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect identifiers, dropping duplicates within this page.
            found_here = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                video_id = match.group(1)
                if video_id not in found_here:
                    found_here.append(video_id)
            video_ids.extend(found_here)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1853
1854
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Result size per GData query; also used to detect the last page.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the --playlist-start / --playlist-end window (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1936
1937
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Result size per AJAX listing query; also used to detect the last page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve the user's numeric id, then page through their episode list."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Bug fix: the regex search used to live inside the try block above,
        # so a page without data-users-id crashed with AttributeError on
        # .group() (the except only catches network errors). Report cleanly.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user ID from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Consistency: use compat_str like the rest of the file
                # (was plain str here).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the --playlist-start / --playlist-end window (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2027
2028
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title for a DepositFiles link."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Bug fix: decode once here. The old code kept `webpage` as bytes
            # and then called .decode('utf-8') on the str results of re.search,
            # which crashes on Python 3 (str patterns cannot match bytes, and
            # str has no decode()).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2087
2088
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from the --username/--password options or from a
        .netrc entry for the 'facebook' machine.  Failures are reported as
        warnings and extraction proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: skip the login step entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means we are still on the login
            # page, i.e. authentication did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL (HD preferred, SD fallback) and title."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flash parameters sit between these two JS fragments as a JSON
        # array of [name, value] pairs.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # The 'params' entry is itself URL-encoded JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2184
2185
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information via blip.tv's JSON API.

        /play/ URLs are first resolved to their canonical /a/a-<id> form and
        re-extracted recursively.  If the JSON endpoint responds with a
        video/* Content-Type, the response itself is the media file and is
        handed over for direct download.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; rebuild the canonical URL and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Ask the same URL for its JSON representation.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode() assumes Python 2 byte strings;
                # on Python 3 this raises AttributeError — confirm runtime.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened successfully above; otherwise the except
                # clause would already have raised ExtractorError.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the file extension from the media URL.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2286
2287
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error via the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this previously read self._download.report_error —
            # there is no such attribute, so an invalid URL crashed with
            # AttributeError instead of being reported cleanly.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link reveals the media server base URL; the flv file
        # lives next to the thumbnails and is named after the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2336
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest quality last.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Map bitrate -> container extension.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Map bitrate -> frame dimensions (used by --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is needed because _VALID_URL is written in verbose
        # (whitespace-insensitive) regex syntax.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report that extraction has started."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show index XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print available formats as bitrate / extension / dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per episode part.

        Handles :shortname abbreviations, full-episode URLs and clip URLs.
        Returns a list of info dictionaries, or None after reporting an
        error.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations to the show's
        # full-episodes page and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No specific episode: the site redirects the full-episodes page
            # to the newest episode.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to the newest episode and re-parse its URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp url) pairs from the config XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL to the equivalent plain-HTTP location.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2531
2532
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report that information extraction has started."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report that the player configuration is being downloaded."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the mp4 URL and metadata for an Escapist video page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset advertised in Content-Type, falling
            # back to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Robustness fix: the four re.search results below were used without
        # None checks, so any page change crashed with AttributeError.
        # Description and thumbnail are optional info fields — tolerate
        # their absence instead of failing.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL is required: the config URL is embedded in it.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2606
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Extractor is currently flagged as broken.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract an f4f fragment URL via the moogaloop metadata XML and
        the Adobe f4m manifest it points at."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally as the metadata and manifest are parsed.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # Pull the media node and id out of the f4m (Adobe HDS) manifest.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Rebuild the first fragment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2677
2678
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The stream location is a percent-encoded flv_url query parameter
        # embedded in the page.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The <title> tag carries the clip name followed by the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # The whole matched thumbnail URL (group 0) is used, not just the
        # trailing filename group.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2736
2737
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    # Groups: (1) uploader slug, (2) track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track page to its numeric id via the API, then fetch
        the stream definitions and return the mp3 stream info."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to the track's JSON metadata via the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the per-track stream map; we use the 128 kbit/s mp3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2810
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set page URL through the Soundcloud API and return the
        info dictionaries of every track in the set."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error replaces the deprecated _downloader.trouble calls
            # for consistency with SoundcloudIE and the rest of the file.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (second path component)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # The resolve endpoint maps the human-readable page URL to the API
        # set object (JSON), which embeds the track list.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports failures (e.g. unknown set) in an 'errors' list.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Fetch the stream definitions for each track in the set.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2891
2892
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL, title and description of an InfoQ talk."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real stream id is base64-encoded in the page.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # rsplit on the last dot: split('.') raised ValueError whenever the
        # filename contained more than one dot.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2946
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud API for a cloudcast and return its info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url.  re groups are already str
        # on Python 3 -- the previous .decode('utf-8') calls crashed there.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw response bytes first)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Initialize so the 'best' branch cannot leave format_param unbound
        # when no format yields a working URL.
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # check_urls returns None when every candidate is dead; report it
        # instead of crashing on file_url.split below.
        if file_url is None:
            self._downloader.report_error(u'unable to find an active media URL')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3061
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL shape: a specific video page, a course page
        (playlist of video references), or the root page (playlist of course
        references).  Playlist branches recurse via self.extract() on each
        referenced URL and concatenate the results."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and the (relative) video file path come from the
                # per-video XML descriptor.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title from the page heading; fall back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect every VideoPage link (deduplicated, order preserved)
            # and extract each one recursively.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect every CoursePage link and extract each course
            # recursively (which in turn extracts its videos).
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3173
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract performer, song name and the highest-quality rendition URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns an already-decoded string; the previous
        # .decode('iso-8859-1') calls crashed on Python 3 str objects.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # Ask the mediaGen service for the rendition list of this video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Report instead of crashing with IndexError on renditions[-1].
            self._downloader.report_error(u'unable to extract any renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error replaces the deprecated _downloader.trouble call.
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3253
3254
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in numbered segments; the real segment file ids are
    reconstructed client-side from a server-provided numeric seed.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: current time in milliseconds followed by two random
        # numbers, concatenated as a decimal string.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character alphabet with a
        # linear-congruential-style generator driven by the server seed;
        # the shuffled list is the decode table used by _get_file_id.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indexes into the seed-shuffled
        # alphabet; translate each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, decode the segment file ids and return
        one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's requested format onto Youku's internal names:
            # hd2 (HD, when available) / flv (default) / mp4 ('worst').
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            # Obfuscated file id plus one access key per segment.
            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3364
3365
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Download the video page and scrape the flv URL, title and
        thumbnail out of it."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the raw page contents.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        def scrape(pattern, field):
            # Search the page for pattern; report an error when it is absent.
            found = re.search(pattern, webpage)
            if found is None:
                self._downloader.report_error(u'unable to extract %s' % field)
            return found

        result = scrape(self.VIDEO_URL_RE, u'video url')
        if result is None:
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = scrape(self.VIDEO_TITLE_RE, u'video title')
        if result is None:
            return
        video_title = result.group(1)

        result = scrape(self.VIDEO_THUMB_RE, u'video thumbnail')
        if result is None:
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3428
3429
3430 class GooglePlusIE(InfoExtractor):
3431     """Information extractor for plus.google.com."""
3432
3433     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3434     IE_NAME = u'plus.google'
3435
    def __init__(self, downloader=None):
        # Delegate to the base class; this extractor keeps no extra state.
        InfoExtractor.__init__(self, downloader)
3438
    def report_extract_entry(self, url):
        """Report that the post entry page is being downloaded."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3442
    def report_date(self, upload_date):
        """Report the extracted entry date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3446
    def report_uploader(self, uploader):
        """Report the extracted uploader name."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3450
    def report_title(self, video_title):
        """Report the extracted video title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3454
    def report_extract_vid_page(self, video_page):
        """Report that the video page is being extracted."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3458
3459     def _real_extract(self, url):
3460         # Extract id from URL
3461         mobj = re.match(self._VALID_URL, url)
3462         if mobj is None:
3463             self._downloader.report_error(u'Invalid URL: %s' % url)
3464             return
3465
3466         post_url = mobj.group(0)
3467         video_id = mobj.group(1)
3468
3469         video_extension = 'flv'
3470
3471         # Step 1, Retrieve post webpage to extract further information
3472         self.report_extract_entry(post_url)
3473         request = compat_urllib_request.Request(post_url)
3474         try:
3475             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3476         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3477             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3478             return
3479
3480         # Extract update date
3481         upload_date = None
3482         pattern = 'title="Timestamp">(.*?)</a>'
3483         mobj = re.search(pattern, webpage)
3484         if mobj:
3485             upload_date = mobj.group(1)
3486             # Convert timestring to a format suitable for filename
3487             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3488             upload_date = upload_date.strftime('%Y%m%d')
3489         self.report_date(upload_date)
3490
3491         # Extract uploader
3492         uploader = None
3493         pattern = r'rel\="author".*?>(.*?)</a>'
3494         mobj = re.search(pattern, webpage)
3495         if mobj:
3496             uploader = mobj.group(1)
3497         self.report_uploader(uploader)
3498
3499         # Extract title
3500         # Get the first line for title
3501         video_title = u'NA'
3502         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3503         mobj = re.search(pattern, webpage)
3504         if mobj:
3505             video_title = mobj.group(1)
3506         self.report_title(video_title)
3507
3508         # Step 2, Stimulate clicking the image box to launch video
3509         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3510         mobj = re.search(pattern, webpage)
3511         if mobj is None:
3512             self._downloader.report_error(u'unable to extract video page URL')
3513
3514         video_page = mobj.group(1)
3515         request = compat_urllib_request.Request(video_page)
3516         try:
3517             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3518         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3519             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3520             return
3521         self.report_extract_vid_page(video_page)
3522
3523
3524         # Extract video links on video page
3525         """Extract video links of all sizes"""
3526         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3527         mobj = re.findall(pattern, webpage)
3528         if len(mobj) == 0:
3529             self._downloader.report_error(u'unable to extract video links')
3530
3531         # Sort in resolution
3532         links = sorted(mobj)
3533
3534         # Choose the lowest of the sort, i.e. highest resolution
3535         video_url = links[-1]
3536         # Only get the url. The resolution part in the tuple has no use anymore
3537         video_url = video_url[-1]
3538         # Treat escaped \u0026 style hex
3539         try:
3540             video_url = video_url.decode("unicode_escape")
3541         except AttributeError: # Python 3
3542             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3543
3544
3545         return [{
3546             'id':       video_id,
3547             'url':      video_url,
3548             'uploader': uploader,
3549             'upload_date':  upload_date,
3550             'title':    video_title,
3551             'ext':      video_extension,
3552         }]
3553
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        # Strip a trailing "/index.html" so the id maps onto the CDN path
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first unescaped group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: the key was 'uploader_date', which is not a field the
            # downloader recognizes (the documented field is 'upload_date').
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3589
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_list).

        Bug fix: on failure this used to return None, which crashed
        _real_extract when it unconditionally unpacked the result into
        two values; now it returns (0, []) so the caller can stop cleanly.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # The API reports errors as a JSON object instead of a list
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO 'YYYY-MM-DD...'; drop dashes for upload_date
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel page: the archive must be fetched page by page
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed) page means we reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3676
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Bug fix: previously fell through and crashed on m.group('url')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # report_error replaces the deprecated trouble() call; bail out
            # instead of crashing on m.group('title') below
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        # Description is optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3713
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose mode, so the default suitable() would fail
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # Movie entries, titles and thumbnails appear in the same order,
        # so they are matched up positionally with zip()
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Bug fix: skip broken entries instead of appending a
                # result without a url after reporting the error
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
                  }
            videos.append(info)
        return videos
3754
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The stream location follows directly from the video id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # Title and uploader are embedded in data-* attributes of the page
        title_match = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = title_match.group('title')
        uploader_match = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = uploader_match.group('uploader')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3776
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com videos."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        # Derive the container extension from the matched media URL
        if mobj is not None:
            video_url = mobj.group()
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # Bug fix: use report_error instead of the deprecated trouble();
            # report_error adds the 'ERROR:' prefix itself, so the old
            # message double-prefixed it.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # NOTE(review): 'World Start' looks like a typo for 'World Star',
            # but it is a runtime fallback string — left unchanged here.
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3832
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON blob in an inline script
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3867
3868
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed via a cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The path segment looks like '480p_370k_8004515': size, bitrate, id
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: this used to test the stale 'result' match object from
            # the page parsing above, so a missing format returned [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3985
3986
3987
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is part of the URL itself
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the upload date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: the old message wrongly said 'unable to extract video
            # title' for a failed upload-date extraction
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4029
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            # Bug fix: the error reporter adds the 'ERROR:' prefix itself, so
            # the old 'ERROR: ...' messages were printed double-prefixed
            raise ExtractorError(u'unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
4075
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded as a JSON blob in an inline script
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id selects a playback order on the server side
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        next_url = first_url
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4119
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Media and thumbnail URLs follow directly from the video id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        if m is None:
            # Robustness fix: fail with a clear message instead of an
            # AttributeError crash when the og:title tag is missing
            raise ExtractorError(u'Unable to extract video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        if m is None:
            # The uploader is optional; warn and continue without it
            self._downloader.report_warning(u'unable to extract uploader')
            uploader = None
        else:
            uploader = clean_html(m.group('uploader'))

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
4143
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose mode, so the default suitable() would fail
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Talk entries and their names appear in the same order on the page
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title_match=re.search(title_RE, webpage)
        if title_match is None:
            # Robustness fix: raise a descriptive error instead of crashing
            # with AttributeError on a failed match
            raise ExtractorError(u'Unable to extract talk title')
        title=title_match.group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        if info_match is None:
            raise ExtractorError(u'Unable to extract talk details')
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                # The thumbnail is optional; a missing one no longer crashes
                'thumbnail': thumb_match.group('thumbnail') if thumb_match else None
                }
        return info
4216
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is taken from the URL path and the actual media data is
    fetched from the site's XML metadata endpoint.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext`
            # (NameError); the file-extension local is called `extension`.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4272
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The title only appears on the HTML page, not in the XML feed.
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The media data lives in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # Use the last listed variant (presumably the best quality —
        # unverified; this mirrors the site's document ordering).
        variant = idoc[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4305
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: previously only reported the problem and then fell
            # through to m.group('title') on None, crashing with an
            # AttributeError. Abort cleanly like the video-url branch above.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional fields.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4354
4355
def gen_extractors():
    """Instantiate every supported extractor and return them as a list.

    Order matters: the first extractor whose pattern matches a URL is the
    one that handles it, so the generic extractor must come last.
    """
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]