Modify YouTube video/playlist URL matching; fixes #668; fixes #585
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    Given a URL, an information extractor produces one dictionary per
    video with the data the FileDownloader needs. The FileDownloader
    then decides what to do with it (download the file, print metadata,
    etc.).

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Subclasses redefine _real_initialize() and _real_extract() and
    provide a _VALID_URL regexp; they should also be registered in the
    list of extractors. _real_extract() must return a *list* of the
    dictionaries described above. Broken extractors set _WORKING to
    False so users are warned and the tests skip them.
    """

    # Class-level defaults; __init__ re-assigns the instance attributes.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True when this extractor can handle *url*."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Return the _WORKING flag (False marks a known-broken IE)."""
        return cls._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.) at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then extract info dicts for *url*."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this extractor reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the display name from the class name, dropping the
        # trailing "IE" suffix.
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Preserve the original traceback for debugging.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
131
132
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; formats not listed here default to flv.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" string, used only for display.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match the very permissive _VALID_URL,
        # so let YoutubePlaylistIE claim them first.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SRT subtitle text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns a (warning_message, srt_text) pair; exactly one of the
        two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: explicit option, then English, then first listed.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # GALX and dsh are hidden form tokens Google expects back on POST.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video ID extracted from *url*."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 1 is the optional URL prefix; group 2 is the video ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the different 'el' variants until one
        # returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # This expression does not match; try the next one.
                    pass
                else:
                    # Parsed successfully -- stop so the normalized date is
                    # not fed back into strptime on the next iteration.
                    break

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            # 'sig' is indexed when building url_map below, so require it in
            # the filter too; otherwise an entry without a signature raises
            # KeyError instead of being skipped.
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud and 'sig' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
591
592
593 class MetacafeIE(InfoExtractor):
594     """Information Extractor for metacafe.com."""
595
596     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
597     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
598     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
599     IE_NAME = u'metacafe'
600
    def __init__(self, downloader=None):
        """Constructor; simply forwards *downloader* to the base class."""
        InfoExtractor.__init__(self, downloader)
603
604     def report_disclaimer(self):
605         """Report disclaimer retrieval."""
606         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
607
608     def report_age_confirmation(self):
609         """Report attempt to confirm age."""
610         self._downloader.to_screen(u'[metacafe] Confirming age')
611
612     def report_download_webpage(self, video_id):
613         """Report webpage download."""
614         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
615
616     def report_extraction(self, video_id):
617         """Report information extraction."""
618         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
619
    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then POST the age
        confirmation so adult-filtered videos become accessible.

        Both steps are best-effort: on network failure the error is
        reported through the downloader and initialization aborts.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            # Response body is ignored; the request exists to obtain the
            # session state the filter form expects.
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            # Again only the side effect (server-side cookie/flag) matters.
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return
642
643     def _real_extract(self, url):
644         # Extract id and simplified title from URL
645         mobj = re.match(self._VALID_URL, url)
646         if mobj is None:
647             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
648             return
649
650         video_id = mobj.group(1)
651
652         # Check if video comes from YouTube
653         mobj2 = re.match(r'^yt-(.*)$', video_id)
654         if mobj2 is not None:
655             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
656             return
657
658         # Retrieve video webpage to extract further information
659         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
660         try:
661             self.report_download_webpage(video_id)
662             webpage = compat_urllib_request.urlopen(request).read()
663         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
664             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
665             return
666
667         # Extract URL, uploader and title from webpage
668         self.report_extraction(video_id)
669         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
670         if mobj is not None:
671             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
672             video_extension = mediaURL[-3:]
673
674             # Extract gdaKey if available
675             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
676             if mobj is None:
677                 video_url = mediaURL
678             else:
679                 gdaKey = mobj.group(1)
680                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
681         else:
682             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
683             if mobj is None:
684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
685                 return
686             vardict = compat_parse_qs(mobj.group(1))
687             if 'mediaData' not in vardict:
688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
689                 return
690             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
691             if mobj is None:
692                 self._downloader.trouble(u'ERROR: unable to extract media URL')
693                 return
694             mediaURL = mobj.group(1).replace('\\/', '/')
695             video_extension = mediaURL[-3:]
696             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
697
698         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
699         if mobj is None:
700             self._downloader.trouble(u'ERROR: unable to extract title')
701             return
702         video_title = mobj.group(1).decode('utf-8')
703
704         mobj = re.search(r'submitter=(.*?);', webpage)
705         if mobj is None:
706             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
707             return
708         video_uploader = mobj.group(1)
709
710         return [{
711             'id':       video_id.decode('utf-8'),
712             'url':      video_url.decode('utf-8'),
713             'uploader': video_uploader.decode('utf-8'),
714             'upload_date':  None,
715             'title':    video_title,
716             'ext':      video_extension.decode('utf-8'),
717         }]
718
719
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a Dailymotion URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The path component may carry a slug ("<id>_some-title") and/or a
        # query string; only the leading id part is wanted.
        video_id = match.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Probe quality keys from best to worst and keep the first hit.
        max_quality = None
        for candidate in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if candidate in flashvars:
                max_quality = candidate
                self._downloader.to_screen(u'[dailymotion] Using %s' % candidate)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: prefer the "owner" span, then the official-user span;
        # missing uploader is only a warning, not a failure.
        video_uploader = None
        owner = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner is not None:
            video_uploader = owner.group(1)
        else:
            # lookin for official user
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Page shows DD-MM-YYYY; the info dict wants YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
807
808
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a Photobucket URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # The watch page itself contains everything we scrape.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        media_match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if media_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(media_match.group(1))

        # Title and uploader both live in the <title> element.
        title_match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = title_match.group(1).decode('utf-8')
        video_uploader = title_match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
872
873
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the info dictionary for a Yahoo! Video URL.

        Non-/watch/ URLs are rewritten to the canonical /watch/ form and
        re-extracted via a single recursive call (new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUG FIX: group(1) captures the '(people|profile)' path component of
        # the regex above; the uploader's name is in the second group.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1015
1016
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the info dictionary for a Vimeo video page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize HLS redirect links to the canonical watch page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded between ' = {config:' and ',assets:'
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # FIX: was a bare "except:", which also swallowed SystemExit and
            # KeyboardInterrupt; narrowed to Exception.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best bucket that has at least one candidate.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1135
1136
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # Matches both the French and German video libraries.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in e.g. "index-123.html".
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the page body, or None after reporting trouble."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex and map the requested capture groups to keys.

        matchTuples is a list of (group_index, key, error_message) triples;
        if any requested group is missing, the error is reported and None
        is returned.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed above, page is None and
        # re.search raises TypeError rather than failing gracefully.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP data for a live-stream page."""
        # Language ("fr"/"de") is the fourth-from-last URL path component.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or used, so
        # live-stream extraction never produces a result — looks unfinished.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract an Arte+7 (catch-up) video and return its info dict."""
        # Language ("fr"/"de") is the third-from-last URL path component.
        video_lang = url.split('/')[-3]
        # Step 1: find the videoref XML URL in the player's flashvars.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the language-specific <video> reference.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: pull id, title, date and the HD stream URL from the XML.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to live-stream or Plus7 extraction based on the URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams yield no info dict (see extractLiveStream note).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1271
1272
1273 class GenericIE(InfoExtractor):
1274     """Generic last-resort information extractor."""
1275
1276     _VALID_URL = r'.*'
1277     IE_NAME = u'generic'
1278
    def __init__(self, downloader=None):
        """Create the generic extractor; delegates to the base class."""
        InfoExtractor.__init__(self, downloader)
1281
1282     def report_download_webpage(self, video_id):
1283         """Report webpage download."""
1284         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1285         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1286
1287     def report_extraction(self, video_id):
1288         """Report information extraction."""
1289         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1290
1291     def report_following_redirect(self, new_url):
1292         """Report information extraction."""
1293         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1294
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # HEAD avoids downloading the body just to discover the final URL.
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Body-describing headers don't apply to the new request.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE: a bare OpenerDirector starts with no handlers, so every one
        # needed (HTTP, HTTPS, error processing, our two above) is added here.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened; extraction continues here.
        if url == new_url:
            return False

        # Otherwise restart the extraction chain on the redirect target.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1349
1350     def _real_extract(self, url):
1351         if self._test_redirect(url): return
1352
1353         video_id = url.split('/')[-1]
1354         request = compat_urllib_request.Request(url)
1355         try:
1356             self.report_download_webpage(video_id)
1357             webpage = compat_urllib_request.urlopen(request).read()
1358         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1359             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1360             return
1361         except ValueError as err:
1362             # since this is the last-resort InfoExtractor, if
1363             # this error is thrown, it'll be thrown here
1364             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1365             return
1366
1367         self.report_extraction(video_id)
1368         # Start with something easy: JW Player in SWFObject
1369         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1370         if mobj is None:
1371             # Broaden the search a little bit
1372             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1373         if mobj is None:
1374             # Broaden the search a little bit: JWPlayer JS loader
1375             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1376         if mobj is None:
1377             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1378             return
1379
1380         # It's possible that one of the regexes
1381         # matched, but returned an empty group:
1382         if mobj.group(1) is None:
1383             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1384             return
1385
1386         video_url = compat_urllib_parse.unquote(mobj.group(1))
1387         video_id = os.path.basename(video_url)
1388
1389         # here's a fun little line of code for you:
1390         video_extension = os.path.splitext(video_id)[1][1:]
1391         video_id = os.path.splitext(video_id)[0]
1392
1393         # it's tempting to parse this further, but you would
1394         # have to take into account all the variations like
1395         #   Video Title - Site Name
1396         #   Site Name | Video Title
1397         #   Video Title - Tagline | Site Name
1398         # and so on and so forth; it's just not practical
1399         mobj = re.search(r'<title>(.*)</title>', webpage)
1400         if mobj is None:
1401             self._downloader.trouble(u'ERROR: unable to extract title')
1402             return
1403         video_title = mobj.group(1)
1404
1405         # video uploader is domain name
1406         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1407         if mobj is None:
1408             self._downloader.trouble(u'ERROR: unable to extract title')
1409             return
1410         video_uploader = mobj.group(1)
1411
1412         return [{
1413             'id':       video_id,
1414             'url':      video_url,
1415             'uploader': video_uploader,
1416             'upload_date':  None,
1417             'title':    video_title,
1418             'ext':      video_extension,
1419         }]
1420
1421
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles the ytsearch:<terms>, ytsearchN:<terms> and ytsearchall:<terms>
    pseudo-URLs, queries the GData API page by page and queues up to N
    matching videos (1 by default, capped at _max_youtube_results).
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): query is utf-8-encoded bytes here (see _real_extract);
        # decoding with the locale's preferred encoding assumes a utf-8
        # locale -- confirm.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUG FIX: split on the first ':' only; search terms that themselves
        # contain a colon used to raise ValueError on tuple unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal "ytsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n  # refined below once the API reports totalItems

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1496
1497
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles the gvsearch:<terms>, gvsearchN:<terms> and gvsearchall:<terms>
    pseudo-URLs, scraping result pages until N videos are collected
    (1 by default, capped at _max_google_results).
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): query is utf-8-encoded bytes here (see _real_extract);
        # decoding with the locale's preferred encoding assumes a utf-8
        # locale -- confirm.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUG FIX: split on the first ':' only; search terms that themselves
        # contain a colon used to raise ValueError on tuple unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal "gvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link means we've exhausted the results.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1578
1579
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles the yvsearch:<terms>, yvsearchN:<terms> and yvsearchall:<terms>
    pseudo-URLs, scraping result pages until N videos are collected
    (1 by default, capped at _max_yahoo_results).  Currently marked
    non-working (_WORKING = False).
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): query is utf-8-encoded bytes here (see _real_extract);
        # decoding with the locale's preferred encoding assumes a utf-8
        # locale -- confirm.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUG FIX: split on the first ':' only; search terms that themselves
        # contain a colon used to raise ValueError on tuple unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal "yvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" link means we've exhausted the results.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1664
1665
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Fetches playlist entries through the GData API in pages of
    _MAX_RESULTS, orders them by playlist position, applies
    --playlist-start/--playlist-end and queues each video URL.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default suitable()
        # (which compiles without flags) must be overridden here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []  # list of (position, video_url) pairs

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Entries without 'content' (e.g. deleted videos) are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # BUG FIX: materialize a list here.  On Python 3, map() returns a
        # lazy iterator, which would make the len() and slicing below raise
        # TypeError.
        videos = [v[1] for v in sorted(videos)]

        total = len(videos)

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1759
1760
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the channel's paginated video listing and queues every video
    id found for download.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Identify the channel from the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        page_num = 1

        # Walk the listing page by page until the "next page" marker
        # disappears from the HTML.
        while True:
            self.report_download_page(channel_id, page_num)
            request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, page_num))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the video ids on this page, skipping duplicates
            # within the page.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_num = page_num + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1811
1812
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through a user's uploads feed via the GData API and queues
    the collected video ids, honoring --playlist-start/--playlist-end.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Pull the username out of the URL (or the ytuser: shorthand).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE video ids,
        # so request successive windows until a short page signals the end.
        video_ids = []
        page_idx = 0

        while True:
            first_index = page_idx * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, first_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, dropping in-page duplicates.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # A page shorter than _GDATA_PAGE_SIZE must be the last one,
            # so there is no need to issue another request.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_idx += 1

        total_found = len(video_ids)
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)

        video_ids = video_ids[start:] if end == -1 else video_ids[start:end]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, total_found, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
1894
1895
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user's numeric id from their page, then pages through
    the mobile episode-list AJAX endpoint (12 entries per page) and
    queues each video, honoring --playlist-start/--playlist-end.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The numeric users_id embedded in the page is what the AJAX
            # endpoint needs, not the username from the URL.
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # BUG FIX: use compat_str like the rest of the file; plain
                # str(err) can fail on non-ASCII messages under Python 2.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUG FIX: unescape before the duplicate check; the original
                # compared the raw match against already-unescaped stored
                # entries, so the same id could be appended twice.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page shorter than _PAGE_SIZE must be the last one - there
            # are no more ids on further pages - no need to query again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1985
1986
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # The (?#locale) is a regex comment; the preceding ".." loosely matches
    # an optional two-letter locale path segment (e.g. /de/, /ru/).
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title for a DepositFiles file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates that click).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # NOTE(review): the response is left undecoded (bytes); the
            # regexes and .decode() calls below presume the Python 2
            # str-as-bytes model -- confirm this path is py2-only.
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace so the site's multi-line notice fits
                # on one error line.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        # Extension without the leading dot, e.g. 'zip'.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2045
2046
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Matches facebook.com/video/video.php?v=<id> and /photo.php?v=<id>;
    # the numeric video id is captured in the named group 'ID'.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint; credentials are POSTed here by _real_initialize.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Best-effort Facebook login.

        Credentials come from --username/--password or, failing that, the
        user's .netrc entry for 'facebook'.  Any failure is reported as a
        warning and extraction proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available; skip login entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login <form>, authentication
            # failed (bad credentials or rate limiting).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video URL, title and thumbnail from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Normalize to the canonical video page URL before downloading.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are embedded in inline JavaScript between these
        # two exact literal markers; match them verbatim.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        # The captured text is a JSON array of [key, value] pairs.
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream; fall back to SD when HD is absent.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2142
2143
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension out of the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info through blip.tv's JSON skin.

        Requests the page with skin=json appended.  If the server answers
        with the media itself (Content-Type video/*), treats it as a direct
        download and hands the open handle to the downloader; otherwise the
        response body is parsed as JSON metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the correct query separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode only exists on Python 2 byte
                # strings; on Python 3 this branch would raise — confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the already-open handle so the request is not
                    # repeated for the actual download.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert e.g. '12-10-12 08:24PM' into the YYYYMMDD form
                # expected by the downloader.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2233
2234
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this previously called self._download.trouble, which
            # raised AttributeError (no such attribute) instead of reporting
            # the invalid-URL error through the downloader.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link reveals the media-server directory; the FLV
        # lives alongside the thumbs directory as <video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2283
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates; the last element of a config's rendition list is picked
    # by default (highest bitrate wins).
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension (all mp4 here)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame dimensions, used only by --list-formats output
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag to compile.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve the URL to one or more episode parts.

        Flow: expand :shortname abbreviations, follow the redirect to a
        concrete episode when needed, find the mtvnservices URI in the page,
        fetch the mRSS index, then download each part's config XML and pick
        a bitrate.  Returns one info dict per episode part.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # ':tds' / ':colbert' style abbreviations point at the newest
        # full episode of the respective show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # The two shows use different URL layouts for clips.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "download the newest episode";
            # the site redirects /full-episodes/ to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # We were redirected; re-parse the final URL to get the episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The mRSS feed lists each part of the episode as an <item>.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The <guid> is an mgid URI, e.g. mgid:cms:video:<site>.com:<id>.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Each <rendition> pairs a bitrate with an RTMP stream URL.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP stream path into a direct HTTP URL on the
            # Limelight CDN (rtmpdump is not needed this way).
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2478
2479
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce that extraction has started for this show."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce the player-configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Fetch the episode page, locate the player config and read the video URL from it."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            # Decode with the charset advertised in Content-Type, or UTF-8.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            page = raw_page.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in the page's <meta> tags; the player URL carries
        # the configuration URL in its 'config' query parameter.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        imgUrl = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        playerUrl = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        configUrl = compat_urllib_parse.unquote(re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            config_handle = compat_urllib_request.urlopen(configUrl)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            configJSON = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL.
        videoUrl = config['playlist'][1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
2553
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken; kept for reference until the site's API is re-examined.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Two-stage extraction: metadata XML first, then the f4m manifest
        from which the final fragment URL is assembled."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally as the two XML documents are parsed.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore parameter is required by Adobe's HTTP Dynamic Streaming.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest uses the Adobe f4m XML namespace.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL from the manifest location plus the ids
        # taken from the manifest (video_id minus its last two characters).
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2624
2625
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce that extraction has started for this video."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flash video URL, title and thumbnail out of the page."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is URL-encoded inside the flashvars string.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title: the page <title> up to the site-name suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the whole matched URL (group 0), not the captured name.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2683
2684
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves a track page through the public resolve.json API, then asks
    the streams endpoint for a direct 128 kbit/s MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce the id-resolution step."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the stream-retrieval step."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Uploader name and track slug both come straight from the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Step 1: resolve the canonical track metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Step 2: fetch the per-track stream definitions.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2757
2758
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Announce that extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Decode the base64 media reference embedded in the page and build
        the rtmpe stream URL from it."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in the jsclassref attribute.
        m = re.search(r"jsclassref='([^']*)'", webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder text.
        video_description = u'No description available.'
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if m is not None:
            video_description = m.group(1)

        # Derive the id and extension from the media filename itself.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2812
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a {bitrate: [url, ...]} mapping, or a bare
        url list when the API supplies no bitrate information (the
        TypeError path).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """List available format/bitrate/extension combinations on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: regex groups are already text; the old .decode('utf-8')
        # calls raised AttributeError under Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode bytes first: json.loads needs text here)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try every format until one with a live URL is found.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2927
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page, a course page
    (expanded to references to all its videos), and the root page
    (expanded to references to all courses, extracted recursively).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Decode to text: under Python 3, running a str regex over
                # the raw bytes returned by read() raises TypeError.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'ignore')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3039
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # webpage is already decoded text; the old .decode('iso-8859-1')
        # calls raised AttributeError under Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML document listing renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality (renditions appear to be
        # ordered low to high -- TODO confirm against live responses).
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3119
3120
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves videos in multiple segments; each segment URL is derived
    from an obfuscated file id that must be de-scrambled with a per-video
    seed, so one extraction returns a list of per-segment info dicts.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp plus two
        random components, concatenated as a decimal string."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive the character substitution table from the per-video seed.

        Repeatedly advances a linear-congruential-style sequence from the
        seed and draws (without replacement) from the source alphabet; the
        exact constants and draw order must match Youku's player.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """De-scramble fileId: each '*'-separated token is an index into the
        seed-derived substitution table; concatenating the looked-up
        characters yields the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Playlist metadata endpoint; returns JSON with title, seed,
        # scrambled file ids and per-segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto one of Youku's stream ids.
            # NOTE(review): any explicit format other than 'best'/'worst'
            # falls through to 'flv' regardless of what was asked for.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (as two hex digits) into the file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3230
3231
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    # Escape the dot so 'XNXX.COM' is matched literally instead of
    # 'XNXX<any char>COM'.
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX\.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flv URL is URL-quoted inside the player parameters.
        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3294
3295
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Without this return, mobj.group(1) below raised AttributeError.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Without this return, links[-1] below raised IndexError.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3419
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com / watch.nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be built directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Search the page for rexp; return the unescaped first group,
            # or default when not found.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was misspelled 'uploader_date', so the date was
            # silently dropped by the downloader.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3455
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_dicts).

        Error paths return (0, []) so the caller can always unpack the
        result; previously they fell through returning None, which made
        the caller's tuple unpacking raise TypeError.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archives are paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed) page means we have reached the end.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3542
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Raise instead of only reporting: m.group() below would crash
            # with AttributeError on a None match anyway.
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        # The description is optional metadata; tolerate its absence.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3579
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            # Raise instead of only reporting: the status id is required to
            # build the media URL, and m.group() below would crash anyway.
            raise ExtractorError(u'Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # The description doubles as the title, so it is required too.
            raise ExtractorError(u'Cannot find description')
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            raise ExtractorError(u'Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3628
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL embeds comments and whitespace, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return one info dict per trailer found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # Movie entries, titles and thumbnails appear in the same page order.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Report and skip: previously a dict with an empty URL was
                # still appended, which fails downstream in the downloader.
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                continue
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            })
        return videos
3669
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The FLV is served straight from the CDN, keyed by the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            # Fail explicitly instead of crashing on m.group(...) of None.
            raise ExtractorError(u'Unable to extract video title')
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        # The uploader is optional metadata; tolerate page layout changes.
        uploader = m.group('uploader') if m else None
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }
        return [info]
3691
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON blob in an inline script.
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if metadata_match is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            show = json.loads(metadata_match.group(1))
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        # Request the 256 kbps stream from the CDN.
        stream_url = show['akamai_url'] + '&cbr=256'
        extension = compat_urllib_parse_urlparse(stream_url).path.rpartition('.')[2]
        host = show.get('host', {})
        return [{
            'id': video_id,
            'url': stream_url,
            'ext': extension,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
3726
3727
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pre-set the cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal if missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal if missing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size+bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # The page lists the best quality first.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: previously tested the stale `result` variable here,
            # so an unavailable requested format slipped through as [None].
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3844
3845
3846
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is taken directly from the URL path.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: this error used to claim "unable to extract video title".
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3888
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page first; it links to the embed page we need.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the stream URL via an addVariable call.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3934
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded as a JSON assignment in the page.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mix_match.group(1))

        # Any random session id is accepted by the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        track_no = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_no + 1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            track_no += 1
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
        return tracks
3978
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Media and thumbnail URLs are derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        if m is None:
            # Fail explicitly instead of crashing with AttributeError on
            # m.group(...) when the page layout changes.
            raise ExtractorError(u'Unable to extract video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        # The uploader is optional metadata; tolerate its absence.
        uploader = unescapeHTML(m.group('uploader')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }
        return [info]
4002
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL embeds comments and whitespace, so re.VERBOSE is mandatory.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on whether the URL names a single talk or a playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): [.\s] matches literal dots and whitespace only (not
        # "any character") — presumably intended to skip markup between
        # attributes; confirm against live playlist pages.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        # Pair each talk entry with its matching title link, in page order.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talk details blob embedded in the page carries the real video
        # id and the media slug needed to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4075
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this branch referenced the undefined name `ext`,
            # raising NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        # Description and thumbnail are optional metadata.
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4131
def gen_extractors():
    """Return a list with an instance of every supported extractor.

    Order matters: the first extractor whose suitable() accepts a URL
    handles it, so GenericIE must stay last.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4183
4184