Merge pull request #730 by @JohnyMoSwag
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor turns a URL into one or more dictionaries
    describing downloadable videos.  Those dictionaries are handed to the
    FileDownloader, which may download the video, print metadata, or take
    other actions.

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and re-define the
    _real_initialize() and _real_extract() methods; _real_extract() must
    return a *list* of dictionaries shaped as above.  They should usually
    also be added to the list of extractors.

    Broken extractors should set _WORKING = False so users are warned and
    the tests skip them.
    """

    _WORKING = True      # False marks the extractor as broken
    _ready = False       # becomes True once _real_initialize() has run
    _downloader = None   # FileDownloader instance, set via set_downloader()

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True if this IE can handle *url*."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initialize the instance once (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by dropping the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): `sys` is not among this file's visible imports --
            # presumably re-exported by `from .utils import *`; verify.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honor an explicit charset in the Content-Type header; otherwise
        # fall back to UTF-8 with replacement of undecodable bytes.
        charset_match = re.match(
            r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
            handle.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        return handle.read().decode(encoding, 'replace')
137
138
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp; must be matched with re.VERBOSE (see suitable()/_extract_id()).
    # Group 1 covers everything before the id; group 2 is the video id itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # URL fetched once to force the site into English (see _real_initialize).
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # machine name looked up in ~/.netrc for credentials
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing defaults to 'flv' elsewhere
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> video dimensions, stored as "<height>x<width>"
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Overrides the base-class IE_NAME property with a plain attribute.
    IE_NAME = u'youtube'
198
199     @classmethod
200     def suitable(cls, url):
201         """Receives a URL and returns True if suitable for this IE."""
202         if YoutubePlaylistIE.suitable(url): return False
203         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
204
205     def report_lang(self):
206         """Report attempt to set language."""
207         self._downloader.to_screen(u'[youtube] Setting language')
208
209     def report_login(self):
210         """Report attempt to log in."""
211         self._downloader.to_screen(u'[youtube] Logging in')
212
213     def report_age_confirmation(self):
214         """Report attempt to confirm age."""
215         self._downloader.to_screen(u'[youtube] Confirming age')
216
217     def report_video_webpage_download(self, video_id):
218         """Report attempt to download video webpage."""
219         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
220
221     def report_video_info_webpage_download(self, video_id):
222         """Report attempt to download video info webpage."""
223         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
224
225     def report_video_subtitles_download(self, video_id):
226         """Report attempt to download video info webpage."""
227         self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
228
229     def report_video_subtitles_request(self, video_id, sub_lang, format):
230         """Report attempt to download video info webpage."""
231         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
232
233     def report_video_subtitles_available(self, video_id, sub_lang_list):
234         """Report available subtitles."""
235         sub_lang = ",".join(list(sub_lang_list.keys()))
236         self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
237
238     def report_information_extraction(self, video_id):
239         """Report attempt to extract video information."""
240         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
241
242     def report_unavailable_format(self, video_id, format):
243         """Report extracted video URL."""
244         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
245
246     def report_rtmp_download(self):
247         """Indicate the download will use the RTMP protocol."""
248         self._downloader.to_screen(u'[youtube] RTMP download detected')
249
250     def _get_available_subtitles(self, video_id):
251         self.report_video_subtitles_download(video_id)
252         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
253         try:
254             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
255         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
256             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
257         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
258         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
259         if not sub_lang_list:
260             return (u'WARNING: video doesn\'t have subtitles', None)
261         return sub_lang_list
262
263     def _list_available_subtitles(self, video_id):
264         sub_lang_list = self._get_available_subtitles(video_id)
265         self.report_video_subtitles_available(video_id, sub_lang_list)
266
267     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
268         self.report_video_subtitles_request(video_id, sub_lang, format)
269         params = compat_urllib_parse.urlencode({
270             'lang': sub_lang,
271             'name': sub_name,
272             'v': video_id,
273             'fmt': format,
274         })
275         url = 'http://www.youtube.com/api/timedtext?' + params
276         try:
277             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
278         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
279             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
280         if not sub:
281             return (u'WARNING: Did not fetch video subtitles', None)
282         return (None, sub_lang, sub)
283
284     def _extract_subtitle(self, video_id):
285         sub_lang_list = self._get_available_subtitles(video_id)
286         sub_format = self._downloader.params.get('subtitlesformat')
287         if self._downloader.params.get('subtitleslang', False):
288             sub_lang = self._downloader.params.get('subtitleslang')
289         elif 'en' in sub_lang_list:
290             sub_lang = 'en'
291         else:
292             sub_lang = list(sub_lang_list.keys())[0]
293         if not sub_lang in sub_lang_list:
294             return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
295
296         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
297         return [subtitle]
298
299     def _extract_all_subtitles(self, video_id):
300         sub_lang_list = self._get_available_subtitles(video_id)
301         sub_format = self._downloader.params.get('subtitlesformat')
302         subtitles = []
303         for sub_lang in sub_lang_list:
304             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
305             subtitles.append(subtitle)
306         return subtitles
307
308     def _print_formats(self, formats):
309         print('Available formats:')
310         for x in formats:
311             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
312
313     def _real_initialize(self):
314         if self._downloader is None:
315             return
316
317         username = None
318         password = None
319         downloader_params = self._downloader.params
320
321         # Attempt to use provided username and password or .netrc data
322         if downloader_params.get('username', None) is not None:
323             username = downloader_params['username']
324             password = downloader_params['password']
325         elif downloader_params.get('usenetrc', False):
326             try:
327                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
328                 if info is not None:
329                     username = info[0]
330                     password = info[2]
331                 else:
332                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
333             except (IOError, netrc.NetrcParseError) as err:
334                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
335                 return
336
337         # Set language
338         request = compat_urllib_request.Request(self._LANG_URL)
339         try:
340             self.report_lang()
341             compat_urllib_request.urlopen(request).read()
342         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
343             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
344             return
345
346         # No authentication to be performed
347         if username is None:
348             return
349
350         request = compat_urllib_request.Request(self._LOGIN_URL)
351         try:
352             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
353         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
354             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
355             return
356
357         galx = None
358         dsh = None
359         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
360         if match:
361           galx = match.group(1)
362
363         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
364         if match:
365           dsh = match.group(1)
366
367         # Log in
368         login_form_strs = {
369                 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
370                 u'Email': username,
371                 u'GALX': galx,
372                 u'Passwd': password,
373                 u'PersistentCookie': u'yes',
374                 u'_utf8': u'霱',
375                 u'bgresponse': u'js_disabled',
376                 u'checkConnection': u'',
377                 u'checkedDomains': u'youtube',
378                 u'dnConn': u'',
379                 u'dsh': dsh,
380                 u'pstMsg': u'0',
381                 u'rmShown': u'1',
382                 u'secTok': u'',
383                 u'signIn': u'Sign in',
384                 u'timeStmp': u'',
385                 u'service': u'youtube',
386                 u'uilel': u'3',
387                 u'hl': u'en_US',
388         }
389         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
390         # chokes on unicode
391         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
392         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
393         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
394         try:
395             self.report_login()
396             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
397             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
398                 self._downloader.report_warning(u'unable to log in: bad username or password')
399                 return
400         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
401             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
402             return
403
404         # Confirm age
405         age_form = {
406                 'next_url':     '/',
407                 'action_confirm':   'Confirm',
408                 }
409         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
410         try:
411             self.report_age_confirmation()
412             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
413         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
414             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
415             return
416
417     def _extract_id(self, url):
418         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
419         if mobj is None:
420             self._downloader.report_error(u'invalid URL: %s' % url)
421             return
422         video_id = mobj.group(2)
423         return video_id
424
    def _real_extract(self, url):
        """Extract metadata and download URLs for a single YouTube video.

        Returns a list of info dictionaries (one per selected format) as
        described in the InfoExtractor docstring, or None after reporting
        an error through the downloader.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-style backslash escapes in the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        # Try several 'el' parameter variants until one response contains a
        # 'token' field; the empty variant is the last resort.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to single spaces before date parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                # NOTE(review): bare except silently skips formats that do
                # not match; it would also hide unrelated errors.
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                # NOTE(review): on failure the entry is a 2-tuple
                # (warning, None), so this 3-way unpack would raise -- verify.
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        # NOTE(review): video_token is not used further in this method.
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): every surviving entry is assumed to also carry a
            # 'sig' field (the filter above checks only itag/url); a missing
            # 'sig' would raise KeyError here -- verify.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one info dictionary per selected (format, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
628
629
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video id, group 2 the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter pages fetched/POSTed once in _real_initialize so that
    # age-restricted videos become reachable.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # Overrides the base-class IE_NAME property with a plain attribute.
    IE_NAME = u'metacafe'
637
638     def __init__(self, downloader=None):
639         InfoExtractor.__init__(self, downloader)
640
641     def report_disclaimer(self):
642         """Report disclaimer retrieval."""
643         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
644
645     def report_age_confirmation(self):
646         """Report attempt to confirm age."""
647         self._downloader.to_screen(u'[metacafe] Confirming age')
648
649     def report_download_webpage(self, video_id):
650         """Report webpage download."""
651         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
652
653     def report_extraction(self, video_id):
654         """Report information extraction."""
655         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
656
657     def _real_initialize(self):
658         # Retrieve disclaimer
659         request = compat_urllib_request.Request(self._DISCLAIMER)
660         try:
661             self.report_disclaimer()
662             disclaimer = compat_urllib_request.urlopen(request).read()
663         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
664             self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
665             return
666
667         # Confirm age
668         disclaimer_form = {
669             'filters': '0',
670             'submit': "Continue - I'm over 18",
671             }
672         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
673         try:
674             self.report_age_confirmation()
675             disclaimer = compat_urllib_request.urlopen(request).read()
676         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
677             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
678             return
679
680     def _real_extract(self, url):
681         # Extract id and simplified title from URL
682         mobj = re.match(self._VALID_URL, url)
683         if mobj is None:
684             self._downloader.report_error(u'invalid URL: %s' % url)
685             return
686
687         video_id = mobj.group(1)
688
689         # Check if video comes from YouTube
690         mobj2 = re.match(r'^yt-(.*)$', video_id)
691         if mobj2 is not None:
692             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
693             return
694
695         # Retrieve video webpage to extract further information
696         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
697         try:
698             self.report_download_webpage(video_id)
699             webpage = compat_urllib_request.urlopen(request).read()
700         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
701             self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
702             return
703
704         # Extract URL, uploader and title from webpage
705         self.report_extraction(video_id)
706         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
707         if mobj is not None:
708             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
709             video_extension = mediaURL[-3:]
710
711             # Extract gdaKey if available
712             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
713             if mobj is None:
714                 video_url = mediaURL
715             else:
716                 gdaKey = mobj.group(1)
717                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
718         else:
719             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
720             if mobj is None:
721                 self._downloader.report_error(u'unable to extract media URL')
722                 return
723             vardict = compat_parse_qs(mobj.group(1))
724             if 'mediaData' not in vardict:
725                 self._downloader.report_error(u'unable to extract media URL')
726                 return
727             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
728             if mobj is None:
729                 self._downloader.report_error(u'unable to extract media URL')
730                 return
731             mediaURL = mobj.group(1).replace('\\/', '/')
732             video_extension = mediaURL[-3:]
733             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
734
735         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
736         if mobj is None:
737             self._downloader.report_error(u'unable to extract title')
738             return
739         video_title = mobj.group(1).decode('utf-8')
740
741         mobj = re.search(r'submitter=(.*?);', webpage)
742         if mobj is None:
743             self._downloader.report_error(u'unable to extract uploader nickname')
744             return
745         video_uploader = mobj.group(1)
746
747         return [{
748             'id':       video_id.decode('utf-8'),
749             'url':      video_url.decode('utf-8'),
750             'uploader': video_uploader.decode('utf-8'),
751             'upload_date':  None,
752             'title':    video_title,
753             'ext':      video_extension.decode('utf-8'),
754         }]
755
756
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video URL, title, uploader and date from a Dailymotion page."""
        # The id is the first path component with any '_title' suffix or
        # query string stripped off.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the watch page; the cookie disables the family filter so
        # age-restricted videos are reachable too.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best available quality, in descending order of preference.
        max_quality = None
        for candidate in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if candidate in flashvars:
                max_quality = candidate
                self._downloader.to_screen(u'[dailymotion] Using %s' % candidate)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Try the regular owner markup first, then the "official user" markup.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            # lookin for official user
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            day, month, year = date_match.groups()
            video_upload_date = year + month + day

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
844
845
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the .flv media URL, title and uploader from a photobucket page."""
        # The video id is the value of the 'current' query parameter.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Download the page that embeds the player.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The media URL is carried by the video_src <link> element.
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader both come from the <title> element.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
909
910
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video page.

        Non-/watch/ URLs are first resolved to the canonical /watch/ form
        (one level of recursion via new_video=False), then the page and the
        playlist XML are parsed for the media URL.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUG FIX: group(1) is the literal 'people'/'profile' path segment;
        # the uploader name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (the playlist request needs them)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1052
1053
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a vimeo.com page.

        Normalizes the URL (force https, canonicalize direct player links),
        downloads the page, parses the embedded config JSON, and picks the
        best quality/codec combination for the download URL.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Direct player links need the regular watch page for the config.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON.  The split can raise IndexError (marker
        # not found) and json.loads ValueError (malformed JSON); the former
        # bare 'except:' also swallowed KeyboardInterrupt/SystemExit.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available entry, preferring hd over sd over other.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1172
1173
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Works by chasing a chain of pages/URLs with grep_webpage(): the watch
    page points to an intermediate URL, which points to an XML document
    containing the actual media URL.
    """

    # Matches French/German arte.tv video pages.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in 'index-<n>.html'.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download and return the raw body of url; returns None on error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map capture groups to dict keys.

        matchTuples is a list of (group_index, key, error_message); the
        returned dict has info[key] = mobj.group(group_index).  Returns
        None if the regex or any required group fails to match.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage returned None, re.search will raise
        # TypeError here rather than failing gracefully — confirm intended.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): uses the older trouble() API while the rest
                # of the file uses report_error(); the err strings already
                # carry an 'ERROR: ' prefix, so switching would double it.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream page to its rtmp URL.

        NOTE(review): video_url is computed but never returned, and no info
        dict is built — live streams are effectively not downloaded; confirm
        whether this is a known limitation.
        """
        # Language code is the 4th-from-last path component of the live URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The JS file carries the stream path, SWF player and rtmp base URL.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract an arte+7 (catch-up) video: page -> videoref XML -> media URL."""
        # Language code is the 3rd-from-last path component of a +7 URL.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # The videoref file lists one <video> element per language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final XML document: id, title, date and the hd-quality media URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or +7 extractor based on the URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams yield no info dict (see extractLiveStream).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1308
1309
1310 class GenericIE(InfoExtractor):
1311     """Generic last-resort information extractor."""
1312
1313     _VALID_URL = r'.*'
1314     IE_NAME = u'generic'
1315
1316     def __init__(self, downloader=None):
1317         InfoExtractor.__init__(self, downloader)
1318
1319     def report_download_webpage(self, video_id):
1320         """Report webpage download."""
1321         if not self._downloader.params.get('test', False):
1322             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1323         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1324
1325     def report_extraction(self, video_id):
1326         """Report information extraction."""
1327         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1328
1329     def report_following_redirect(self, new_url):
1330         """Report information extraction."""
1331         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1332
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request (falling back to GET on 405) and compares the
        final URL to the original.  If they differ, re-dispatches the new URL
        through the downloader and returns True; otherwise returns False.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that forces the HEAD method.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Retry as a plain (GET) request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following redirects means there was nothing to follow.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1387
1388     def _real_extract(self, url):
1389         if self._test_redirect(url): return
1390
1391         video_id = url.split('/')[-1]
1392         try:
1393             webpage = self._download_webpage(url, video_id)
1394         except ValueError as err:
1395             # since this is the last-resort InfoExtractor, if
1396             # this error is thrown, it'll be thrown here
1397             self._downloader.report_error(u'Invalid URL: %s' % url)
1398             return
1399
1400         self.report_extraction(video_id)
1401         # Start with something easy: JW Player in SWFObject
1402         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1403         if mobj is None:
1404             # Broaden the search a little bit
1405             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1406         if mobj is None:
1407             # Broaden the search a little bit: JWPlayer JS loader
1408             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1409         if mobj is None:
1410             self._downloader.report_error(u'Invalid URL: %s' % url)
1411             return
1412
1413         # It's possible that one of the regexes
1414         # matched, but returned an empty group:
1415         if mobj.group(1) is None:
1416             self._downloader.report_error(u'Invalid URL: %s' % url)
1417             return
1418
1419         video_url = compat_urllib_parse.unquote(mobj.group(1))
1420         video_id = os.path.basename(video_url)
1421
1422         # here's a fun little line of code for you:
1423         video_extension = os.path.splitext(video_id)[1][1:]
1424         video_id = os.path.splitext(video_id)[0]
1425
1426         # it's tempting to parse this further, but you would
1427         # have to take into account all the variations like
1428         #   Video Title - Site Name
1429         #   Site Name | Video Title
1430         #   Video Title - Tagline | Site Name
1431         # and so on and so forth; it's just not practical
1432         mobj = re.search(r'<title>(.*)</title>', webpage)
1433         if mobj is None:
1434             self._downloader.report_error(u'unable to extract title')
1435             return
1436         video_title = mobj.group(1)
1437
1438         # video uploader is domain name
1439         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1440         if mobj is None:
1441             self._downloader.report_error(u'unable to extract title')
1442             return
1443         video_uploader = mobj.group(1)
1444
1445         return [{
1446             'id':       video_id,
1447             'url':      video_url,
1448             'uploader': video_uploader,
1449             'upload_date':  None,
1450             'title':    video_title,
1451             'ext':      video_extension,
1452         }]
1453
1454
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ``ytsearch[N|all]:<query>`` pseudo-URLs: queries the GData
    JSON-C API and queues the first N results on the downloader.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            # bare "ytsearch:" -> first result only
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # `limit` shrinks to the API's totalItems once known.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                # BUG FIX: this path used self._downloader.trouble(), the
                # deprecated API; every other error path uses report_error.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1533
1534
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles ``gvsearch[N|all]:<query>`` pseudo-URLs: scrapes the Google
    Video result pages and queues up to N matching videos.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # BUG FIX: decode the page so the str regex patterns below
                # work on Python 3 (matching str patterns against bytes
                # raises TypeError); sibling extractors already decode.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: queue what we found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1615
1616
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles ``yvsearch[N|all]:<query>`` pseudo-URLs: scrapes the Yahoo!
    Video result pages and queues up to N matching videos.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # BUG FIX: decode the page so the str regex patterns below
                # work on Python 3 (matching str patterns against bytes
                # raises TypeError); sibling extractors already decode.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" link: queue what we found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1701
1702
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Matches the many playlist-style YouTube URL shapes (playlist, course,
    view_play_list, artist, user-uploads "UU..." lists, bare list ids),
    pages through the GData API to collect every entry, applies the
    --playlist-start/--playlist-end window, and queues each video URL.
    """

    # Matched with re.VERBOSE (see suitable / _real_extract).  The playlist
    # id lands in group(1) (full URL forms) or group(2) (bare id form).
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData playlist feed: (playlist_id, max_results, start_index)
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Must compile with re.VERBOSE because _VALID_URL is multi-line.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            # start-index is 1-based in the GData API.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Keep (position, video_url) pairs so the list can be sorted
            # back into playlist order below; entries without 'content'
            # (e.g. deleted/private videos) are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means we reached the end of the playlist.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then drop the position.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Apply the --playlist-start / --playlist-end window (1-based,
        # playlistend == -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1795
1796
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks a channel's paginated video-list pages, scrapes the watch-page
    video ids from each, and queues every video on the downloader.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Announce the download of one channel page."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Validate the URL and pull out the channel id.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return
        channel_id = match.group(1)

        collected = []
        page_num = 1

        # Fetch pages until the "Next »" marker disappears.
        while True:
            self.report_download_page(channel_id, page_num)
            page_url = self._TEMPLATE_URL % (channel_id, page_num)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page (de-duplicated per page only).
            seen_here = []
            for m in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = m.group(1)
                if vid not in seen_here:
                    seen_here.append(vid)
            collected.extend(seen_here)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_num = page_num + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected)))

        # Queue every collected video.
        for vid in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1847
1848
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Collects all upload ids of a user via the paged GData uploads feed,
    applies the --playlist-start/--playlist-end window, and queues each
    watch-page URL on the downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    # (username, max_results, start_index) — start-index is 1-based.
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            # (de-duplicated per page; cross-page duplicates are kept)
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the --playlist-start / --playlist-end window (1-based,
        # playlistend == -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1930
1931
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a username to its numeric users_id (scraped from the profile
    page), pages through the mobile episode-list endpoint (_PAGE_SIZE
    results per request), applies the --playlist-start/--playlist-end
    window, and queues each video on the downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # BUG FIX: the users-id scrape used to live inside the try above,
        # so a page without the attribute raised an uncaught AttributeError
        # (the except clause only catches network errors).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract blip.tv user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # BUG FIX: was str(err) — the rest of the file uses
                # compat_str, which is also safe for py2 unicode messages.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUG FIX: the dedupe check used to compare the raw match
                # against a list of unescaped values, so entity-containing
                # ids were never de-duplicated.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the --playlist-start / --playlist-end window (1-based,
        # playlistend == -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2021
2022
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Requests the file page with the 'Free download' form submitted and
    scrapes the real fileshare URL and title from the response.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUG FIX: decode the page once, here.  The old code matched
            # str regex patterns against the raw bytes (a TypeError on
            # Python 3) and then called .decode() on values that are
            # already str under Python 3 (an AttributeError).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's explanation message.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2081
2082
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Credentials come from --username/--password or, with --netrc, from
        the 'facebook' machine entry in ~/.netrc.  Login failures are
        reported as warnings and extraction proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response: under Python 3 read() returns bytes and
            # the str regex below would raise TypeError otherwise.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # A login <form> still present in the response means we are not
            # logged in.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON between these two
        # exact JavaScript snippets in the page source.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream and fall back to SD.  Use .get() so a missing
        # key reaches the clean ExtractorError below instead of a KeyError.
        video_url = params.get('hd_src')
        if not video_url:
            video_url = params.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2178
2179
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL serves the media file directly."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        /play/ embed URLs are first resolved to the canonical page and the
        method recurses once.  If the URL serves media directly, the open
        handle is passed on via 'urlhandle'; otherwise the site's JSON API
        (skin=json) is queried for metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Embed player URLs redirect to a page whose URL fragment carries a
        # 'file' parameter; rebuild the canonical URL from it and recurse.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves the JSON variant to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # Bug fix: on Python 3 the URL is already text and str has no
                # .decode(); only decode when we actually hold bytes (Py2 str).
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2280
2281
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title of a myvideo.de video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this read 'self._download.report_error', which raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media-server base URL; the video
        # itself is that base plus '/<id>.flv'.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2330
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, in ascending order; the last is the default pick.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension (all mp4 currently)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> display resolution, used for --list-formats output only
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-part media configuration."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's RSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Used for --list-formats: bitrate, extension and resolution.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode or clip.

        Resolves shortnames/newest-episode URLs, downloads the MRSS index
        of the episode, then one media configuration per part, and maps
        the RTMP stream URL onto its plain-HTTP mirror.  Returns a list of
        one info dict per part.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortnames like ':tds' mean "newest full episode of that show".
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare /full-episodes/ URL means "download the newest one".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The site redirects to the concrete newest episode; re-parse
            # the final URL to recover its episode name.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The mtvnservices URI identifying the episode's media.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One <item> per part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs for this part.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The RTMP stream path after 'gsp.comedystor/' also exists on a
            # plain-HTTP mirror; download from there instead of via RTMP.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2525
2526
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video information from an escapistmagazine.com page.

        The page's OpenGraph meta tags point at the Flash player; its
        'config=' query argument is a JavaScript-flavoured JSON playlist
        whose second entry holds the actual video URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset announced in the Content-Type header.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Robustness fix: these four matches used to be dereferenced without
        # a None check and crashed with AttributeError when a tag was missing.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.report_error(u'unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.report_error(u'unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes must become
        # double quotes before json.loads will accept it.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the video itself.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2600
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked as known-broken: the extractor is skipped by default.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video information from a collegehumor.com video page.

        Performs two requests: a metadata XML document (title, description,
        thumbnail, manifest URL), then the Adobe HDS (f4m) manifest from
        which the fragment URL is reconstructed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: the metadata XML.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Second request: the f4m manifest; the 'hdcore' argument is required.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest uses the Adobe f4m XML namespace; note that video_id
        # is rebound here to the manifest's own <id> element.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Rebuild the first fragment URL from the manifest id and media url.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2671
2672
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract URL, title and thumbnail for an xvideos.com video."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is URL-encoded inside the player's flashvars.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # The title is the <title> tag minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # The whole matched URL (group 0) is the thumbnail address.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2730
2731
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The page URL is first resolved through the public API
    (resolve.json) into a track record; a second API request
    (tracks/<id>/streams) then yields the direct mp3 stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report resolution of the page URL to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report retrieval of the stream definitions."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a SoundCloud track page into a direct mp3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the slug of the track title live in the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Step 1: resolve the page URL into the track's API record.
        page_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + page_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Step 2: fetch the stream definitions for the resolved track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2804
2805
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com talks (rtmpe streams)."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL, title and description of a talk."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real path is base64-encoded in 'jsclassref'.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot: the old two-value split('.') unpacking raised
        # ValueError whenever the filename contained more than one dot.
        video_id, _, extension = video_filename.rpartition('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2859
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry is a dict keyed by bitrate, pick *bitrate*
        (falling back to the highest available one for 'best'/unknown);
        when it is a plain list there is no bitrate info and it is
        returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue # dead link, try the next candidate

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve a cloudcast URL via the API and return the first working file URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url.  re.match groups are already
        # text strings; the old .decode('utf-8') calls raised AttributeError
        # under Python 3 and have been removed (here and in the result dict).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe every format until one of its urls answers
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2974
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Dispatches on the URL shape: a specific video page yields one info
    dict; a course page and the root page act as playlists whose entries
    are re-extracted recursively through self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and the video's relative path come from the per-video XML.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked VideoPage becomes a reference entry, extracted recursively below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every CoursePage link is itself a playlist, extracted recursively.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3086
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract an mtv.com video.

        Reads song/performer/uri metadata from <meta> tags, then asks the
        mediaGen endpoint for the renditions and picks the last (highest
        quality) one.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns decoded text, so the old
        # .decode('iso-8859-1') calls raised AttributeError on Python 3;
        # the matched groups are unescaped directly instead.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message used to read 'unable to mtvn_uri' (missing verb).
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error replaces the deprecated trouble(), matching the
            # error handling used throughout the rest of this file.
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3166
3167
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in numbered segments; each segment's file id is
    derived from an obfuscated 'fileid' string plus a per-video 'seed'.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: millisecond timestamp followed by two random numbers.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-driven permutation of the id alphabet.

        A linear-congruential step (seed -> (seed*211 + 30031) % 65536)
        picks and removes one alphabet character per iteration.  The exact
        arithmetic and ordering mirror Youku's player script, so this body
        must not be reorganized.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # De-obfuscate fileId: each '*'-separated number indexes into the
        # permutation table produced from the seed.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # getPlayList returns a JSON config with titles, seed, stream file
        # ids and segment keys for every available format.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested quality onto Youku's format names: hd2
            # (when present) for best, mp4 for worst, flv otherwise.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3277
3278
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Report information extraction"""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Download the page and pull out the flv url, title and thumbnail."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = m.group(1)

        self.report_webpage(video_id)

        # Fetch the page (UTF-8 encoded).
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        m = re.search(self.VIDEO_URL_RE, webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        m = re.search(self.VIDEO_TITLE_RE, webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        m = re.search(self.VIDEO_THUMB_RE, webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = m.group(1)

        result = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [result]
3341
3342
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video linked from a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # Bail out: without a match, mobj.group(1) below would raise
            # AttributeError instead of failing with a clean error.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # Bail out: links[-1] below would raise IndexError on an empty list.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3466
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com / www.nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN mp4 URL from the page path and scrape page metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (HTML-unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Key was misspelled 'uploader_date'; the standard metadata
            # field consumed downstream is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3502
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Group 1: channel name; optional group 2: broadcast id ("/b/<id>").
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    # Maximum number of items one justin.tv API request returns.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Fetch one API page and return (item_count, list_of_info_dicts).

        NOTE(review): on download failure or an API error object this reports
        the problem and returns None, which makes the tuple unpacking in
        _real_extract raise TypeError -- confirm whether aborting that way is
        intended.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        # len(response) (not len(info)) drives the pagination loop below, so
        # clips without a video_file_url still count toward a "full" page.
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Only the channel part matched: list the channel archives (paged).
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A page shorter than the limit means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3589
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Raise instead of report-and-continue: the old code fell through
            # and crashed with AttributeError on m.group() when m was None.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Also replaces the deprecated _downloader.trouble() call.
            raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        # Description is optional; leave it as None when absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3626
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so it needs re.VERBOSE here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # Movie metadata, display titles and thumbnails appear in matching
        # order on the page, so the three match iterators are zipped together.
        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        movie_matches = re.finditer(movie_re, webpage)
        title_matches = re.finditer(title_re, webpage)
        thumb_matches = re.finditer(thumb_re, webpage)
        results = []
        for movie_m, title_m, thumb_m in zip(movie_matches, title_matches, thumb_matches):
            clip_id = movie_m.group('videoID')
            clip_url = movie_m.group('videoURL')
            if not clip_url:
                self._downloader.report_error(u'Cannot find video url for %s' % clip_id)
            results.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(title_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail')
            })
        return results
3667
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)
        # Title and uploader id are embedded as data attributes in the page.
        title_m = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = title_m.group('title')
        uploader_m = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage)
        uploader = uploader_m.group('uploader')
        return [{
            'id': video_id,
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3689
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # The real media file is served from a hw-videos host.
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Pick the extension from the matched media URL.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # report_error replaces the deprecated _downloader.trouble() call,
            # matching the convention used by the other extractors.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fixed typo in the fallback title ("World Start" -> "World Star").
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3745
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON assignment inside a script tag.
        json_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3780
3781
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional -- warn and continue when missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # Formats are listed best-first on the page.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: previously tested `result` (the last regex match, always
            # non-None here) instead of `format`, so an unknown requested
            # format silently returned [None] instead of reporting an error.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3898
3899
3900
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded in the javascript player configuration.
        url_m = re.search(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if url_m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_m.group('url'))

        # Upload date as displayed on the page.
        # NOTE(review): the error message below says "title" but this step
        # extracts the date.
        date_m = re.search(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if date_m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        upload_date = date_m.group('date')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
3942
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # Title comes from the page's <title> element.
        title_m = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_m is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_m.group('title').strip()

        # The actual player lives on a separate embed page.
        embed_m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_m.group(0).strip()
        video_id = embed_m.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv source is passed to the flash player via addVariable.
        source_m = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_m is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_m.group('source')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3988
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a javascript assignment.
        mix_m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_m:
            raise ExtractorError(u'Cannot find trax information')
        mix_data = json.loads(mix_m.group(1))

        # A random play-session id keeps the per-track stream URLs valid.
        session = str(random.randint(0, 1000000000))
        mix_id = mix_data['id']
        track_count = mix_data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': mix_data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            # Each response points at the id needed to request the next track.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4032
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)
        # Title comes from the OpenGraph metadata.
        title_m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_m.group('title'))
        # Uploader name sits in the user bio block.
        uploader_m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_m.group('uploader'))
        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
4056
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch: single talk vs. playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the `([.\s]*?)` groups only match literal dots and
        # whitespace -- presumably `[\s\S]` (any character) was intended;
        # confirm against the current page markup.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Talk entries and title links are assumed to appear in the same
        # order on the page, so the iterators are zipped together.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # Pull the numeric id and the media slug out of the talkDetails
        # javascript object embedded in the page.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4129
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fixed NameError: the fallback referenced the undefined name
            # `ext`; the filename extension computed above is `extension`.
            format = extension
        else:
            format = format_id_el.text
        # Description and preview image are optional metadata elements.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4185
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_m.group(1))

        # Stream metadata lives in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last variant listed in the XML, as the original code did.
        variant = idoc[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4218
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error replaces the deprecated _downloader.trouble() call.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Raise instead of report-and-continue: the old code fell through
            # and crashed with AttributeError on m.group() when m was None.
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional; leave them as None if absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4267
4268
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in matching-priority order; GenericIE must remain
    # last because it matches nearly any URL.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]