Merge pull request #715 from joksnet/no_video_results
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Information Extractor class.

    An information extractor ("IE") turns a URL into metadata about the
    video (or videos) that URL refers to: the real media URL, the title,
    the uploader, and so on. The metadata is handed to the FileDownloader
    as a dictionary; the FileDownloader then processes it, possibly
    downloading the video to the file system, among other outcomes.

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and re-define the
    _real_initialize() and _real_extract() methods; _real_extract() must
    return a *list* of dictionaries shaped as described above. New
    extractors should usually also be added to the list of extractors.

    Extractors that are known to be broken should set _WORKING to False
    so that users get a warning and the tests skip them.
    """

    _ready = False          # becomes True once _real_initialize() has run
    _downloader = None      # FileDownloader instance, set via set_downloader()
    _WORKING = True         # flip to False for known-broken extractors

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default name: the class name with its trailing "IE" suffix dropped.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        message = u'Downloading video webpage' if note is None else note
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, message))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" (used for --list-formats / the format string)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class
        # implementation (which matches without flags) cannot be used.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without an explicit duration get 4 seconds
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning_message, srt_contents) pair; exactly one of the
        two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map lang_code -> track name
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language priority: user choice, then English, then whatever is first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print one "itag : extension [dimensions]" line per available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, if credentials are available,
        log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form tokens needed by the Google login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Extract the 11-character video ID from a YouTube URL (or naked ID)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 1 is the (optional) URL prefix, group 2 the video ID itself.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract video metadata and return a list of info dictionaries,
        one per format selected for download."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the backslash-escaped URL from the JS config.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then try the known date layouts.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # This layout did not match; try the next one.
                    pass
                else:
                    break

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict carries a 'sig' field;
            # a stream without one would raise KeyError here — verify upstream.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
587
588
589 class MetacafeIE(InfoExtractor):
590     """Information Extractor for metacafe.com."""
591
592     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
593     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
594     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
595     IE_NAME = u'metacafe'
596
597     def __init__(self, downloader=None):
598         InfoExtractor.__init__(self, downloader)
599
600     def report_disclaimer(self):
601         """Report disclaimer retrieval."""
602         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
603
604     def report_age_confirmation(self):
605         """Report attempt to confirm age."""
606         self._downloader.to_screen(u'[metacafe] Confirming age')
607
608     def report_download_webpage(self, video_id):
609         """Report webpage download."""
610         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
611
612     def report_extraction(self, video_id):
613         """Report information extraction."""
614         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
615
616     def _real_initialize(self):
617         # Retrieve disclaimer
618         request = compat_urllib_request.Request(self._DISCLAIMER)
619         try:
620             self.report_disclaimer()
621             disclaimer = compat_urllib_request.urlopen(request).read()
622         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
623             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
624             return
625
626         # Confirm age
627         disclaimer_form = {
628             'filters': '0',
629             'submit': "Continue - I'm over 18",
630             }
631         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
632         try:
633             self.report_age_confirmation()
634             disclaimer = compat_urllib_request.urlopen(request).read()
635         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
636             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
637             return
638
639     def _real_extract(self, url):
640         # Extract id and simplified title from URL
641         mobj = re.match(self._VALID_URL, url)
642         if mobj is None:
643             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
644             return
645
646         video_id = mobj.group(1)
647
648         # Check if video comes from YouTube
649         mobj2 = re.match(r'^yt-(.*)$', video_id)
650         if mobj2 is not None:
651             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
652             return
653
654         # Retrieve video webpage to extract further information
655         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
656         try:
657             self.report_download_webpage(video_id)
658             webpage = compat_urllib_request.urlopen(request).read()
659         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
660             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
661             return
662
663         # Extract URL, uploader and title from webpage
664         self.report_extraction(video_id)
665         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
666         if mobj is not None:
667             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
668             video_extension = mediaURL[-3:]
669
670             # Extract gdaKey if available
671             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
672             if mobj is None:
673                 video_url = mediaURL
674             else:
675                 gdaKey = mobj.group(1)
676                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
677         else:
678             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
679             if mobj is None:
680                 self._downloader.trouble(u'ERROR: unable to extract media URL')
681                 return
682             vardict = compat_parse_qs(mobj.group(1))
683             if 'mediaData' not in vardict:
684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
685                 return
686             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
687             if mobj is None:
688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
689                 return
690             mediaURL = mobj.group(1).replace('\\/', '/')
691             video_extension = mediaURL[-3:]
692             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
693
694         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
695         if mobj is None:
696             self._downloader.trouble(u'ERROR: unable to extract title')
697             return
698         video_title = mobj.group(1).decode('utf-8')
699
700         mobj = re.search(r'submitter=(.*?);', webpage)
701         if mobj is None:
702             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
703             return
704         video_uploader = mobj.group(1)
705
706         return [{
707             'id':       video_id.decode('utf-8'),
708             'url':      video_url.decode('utf-8'),
709             'uploader': video_uploader.decode('utf-8'),
710             'upload_date':  None,
711             'title':    video_title,
712             'ext':      video_extension.decode('utf-8'),
713         }]
714
715
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Matches e.g. http://www.dailymotion.com/video/<id>_<slug>
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # NOTE(review): _WORKING is False — presumably this flags the
    # extractor as broken/disabled; confirm how the downloader uses it.
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video info from a Dailymotion URL.

        Returns a single-item list with the info dictionary, or None
        (after reporting trouble) on failure.
        """
        # Extract id and simplified title from URL; the id is the path
        # segment up to the first '_' (slug separator) or '?'.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information.
        # The cookie turns the family filter off so filtered videos are
        # still served.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality: keys are tried from highest
        # to lowest; the for/else reports trouble when none is present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # Pull the URL string value for the selected quality key.
        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # Unescape JSON-style "\/" sequences.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for the official user account instead
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                # Uploader is optional: warn but keep going with None.
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date appears as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
803
804
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video info from a Photobucket URL.

        Returns a single-item info list, or None (after reporting
        trouble) on any failure.
        """
        # The video id is the .flv filename captured from the URL itself.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        video_id = match.group(1)

        video_extension = 'flv'

        # Fetch the page the URL points at.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Pull the media URL first, then the title/uploader pair,
        # bailing out with a trouble report as soon as anything is missing.
        self.report_extraction(video_id)
        file_match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if file_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(file_match.group(1))

        # Title and uploader come from one <title> tag match.
        title_match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': title_match.group(2).decode('utf-8'),
            'upload_date':  None,
            'title':    title_match.group(1).decode('utf-8'),
            'ext':      video_extension.decode('utf-8'),
        }]
868
869
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video URL.

        Non-/watch/ URLs are first rewritten to their canonical /watch/
        form and re-extracted (one level of recursion, flagged with
        new_video=False).  Returns a single-item info list, or None
        (after reporting trouble) on failure.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Bug fix: group(1) captures the (people|profile) path segment;
        # the uploader name is in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed by the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1011
1012
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo URL.

        Parses the embedded player config JSON for metadata, picks the
        best available quality/codec, and builds the play_redirect URL.
        Returns a single-item info list, or None (after reporting
        trouble) on failure.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize player redirect links to the canonical video page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded between ' = {config:' and ',assets:'
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # split()[1] raises IndexError when the marker is absent and
            # json.loads raises ValueError on malformed JSON.  The old
            # bare "except:" also swallowed KeyboardInterrupt/SystemExit.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (ISO date in dateCreated meta -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty bucket in preference order; the
        # for/else reports trouble when no codec matched at all.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1131
1132
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in index-<n>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its contents, or None after reporting trouble."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map the requested groups to a dict.

        matchTuples is a list of (group_index, key, error_message)
        triples; each group must be present or extraction aborts with
        the corresponding error.  Returns the dict, or None on failure.
        """
        page = self.fetch_webpage(url)
        # fetch_webpage reports trouble and returns None on failure;
        # without this guard, re.search(regex, None, ...) raised a
        # TypeError instead of aborting cleanly.
        if page is None:
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live-stream page down to its rtmp URL and path."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        # grep_webpage returns None after reporting trouble; stop here
        # instead of crashing on info.get below.
        if info is None:
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is computed but never returned or
        # stored, and _real_extract returns None for live URLs — live
        # streams appear to be unsupported; confirm before relying on this.

    def extractPlus7Stream(self, url):
        """Follow the videoref chain for a regular (Plus7) video.

        Returns the info dictionary, or None when any step failed.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)
        # extractPlus7Stream returns None after reporting trouble;
        # do not wrap a None into the result list.
        if info is None:
            return

        return [info]
1267
1268
1269 class GenericIE(InfoExtractor):
1270     """Generic last-resort information extractor."""
1271
1272     _VALID_URL = r'.*'
1273     IE_NAME = u'generic'
1274
1275     def __init__(self, downloader=None):
1276         InfoExtractor.__init__(self, downloader)
1277
1278     def report_download_webpage(self, video_id):
1279         """Report webpage download."""
1280         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1281         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1282
1283     def report_extraction(self, video_id):
1284         """Report information extraction."""
1285         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1286
1287     def report_following_redirect(self, new_url):
1288         """Report information extraction."""
1289         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1290
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET so the
            # final URL can be found without downloading any body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Escape spaces; some servers emit unencoded Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other redirect code is treated as an error.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-open the same URL with a plain (GET) Request.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE(review): the default HTTPRedirectHandler is deliberately
        # replaced by HEADRedirectHandler above; handler order looks
        # intentional — confirm before reordering.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened; let the generic extractor proceed on url.
        if url == new_url:
            return False

        # Redirected: hand the final URL back to the downloader so the
        # full extractor chain runs against it.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1345
1346     def _real_extract(self, url):
1347         if self._test_redirect(url): return
1348
1349         video_id = url.split('/')[-1]
1350         request = compat_urllib_request.Request(url)
1351         try:
1352             self.report_download_webpage(video_id)
1353             webpage = compat_urllib_request.urlopen(request).read()
1354         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1355             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1356             return
1357         except ValueError as err:
1358             # since this is the last-resort InfoExtractor, if
1359             # this error is thrown, it'll be thrown here
1360             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1361             return
1362
1363         self.report_extraction(video_id)
1364         # Start with something easy: JW Player in SWFObject
1365         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1366         if mobj is None:
1367             # Broaden the search a little bit
1368             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1369         if mobj is None:
1370             # Broaden the search a little bit: JWPlayer JS loader
1371             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1372         if mobj is None:
1373             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1374             return
1375
1376         # It's possible that one of the regexes
1377         # matched, but returned an empty group:
1378         if mobj.group(1) is None:
1379             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380             return
1381
1382         video_url = compat_urllib_parse.unquote(mobj.group(1))
1383         video_id = os.path.basename(video_url)
1384
1385         # here's a fun little line of code for you:
1386         video_extension = os.path.splitext(video_id)[1][1:]
1387         video_id = os.path.splitext(video_id)[0]
1388
1389         # it's tempting to parse this further, but you would
1390         # have to take into account all the variations like
1391         #   Video Title - Site Name
1392         #   Site Name | Video Title
1393         #   Video Title - Tagline | Site Name
1394         # and so on and so forth; it's just not practical
1395         mobj = re.search(r'<title>(.*)</title>', webpage)
1396         if mobj is None:
1397             self._downloader.trouble(u'ERROR: unable to extract title')
1398             return
1399         video_title = mobj.group(1)
1400
1401         # video uploader is domain name
1402         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1403         if mobj is None:
1404             self._downloader.trouble(u'ERROR: unable to extract title')
1405             return
1406         video_uploader = mobj.group(1)
1407
1408         return [{
1409             'id':       video_id,
1410             'url':      video_url,
1411             'uploader': video_uploader,
1412             'upload_date':  None,
1413             'title':    video_title,
1414             'ext':      video_extension,
1415         }]
1416
1417
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearchN:query)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearchN: prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms that themselves
        # contain a colon are not truncated (plain split() would raise).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            # Keep the try narrow: only the int() parse may raise here.
            # Previously _download_n_results ran inside this try, so a
            # ValueError from it was misread as a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the total number of hits; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1496
1497
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearchN:query)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN: prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms that themselves
        # contain a colon are not truncated (plain split() would raise).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Keep the try narrow so a ValueError raised inside
            # _download_n_results is not misread as a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1578
1579
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearchN:query)."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN: prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms that themselves
        # contain a colon are not truncated (plain split() would raise).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Keep the try narrow so a ValueError raised inside
            # _download_n_results is not misread as a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1664
1665
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a playlist and queue them for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A URL that points at a single video inside the playlist:
        # hand that one video straight back to the downloader.
        single_video_id = mobj.group(3)
        if single_video_id is not None:
            self._downloader.download([single_video_id])
            return

        # 'p' is the default prefix for ordinary playlists; artist pages
        # ('a') are served from a different endpoint.
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        # Walk the paginated playlist until the "Next" link disappears.
        for pagenum in itertools.count(1):
            self.report_download_page(playlist_id, pagenum)
            page_url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(page_url)).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page; first occurrence wins.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        total = len(video_ids)

        # Apply --playlist-start / --playlist-end (start is 1-based).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1744
1745
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a channel and queue them for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        # Walk the paginated channel listing until the "Next" link disappears.
        for pagenum in itertools.count(1):
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(page_url)).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page; first occurrence wins.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1796
1797
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user via the GData API and queue them."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps results per request (currently 50), so the
        # uploads feed is walked page by page; a short page marks the end.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page; first occurrence wins.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page with fewer than _GDATA_PAGE_SIZE ids must be the last
            # one, so there is no need to issue another query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        all_ids_count = len(video_ids)

        # Apply --playlist-start / --playlist-end (start is 1-based).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1879
1880
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect every video of a blip.tv user and queue them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric users_id embedded in the profile page is required to
        # build the Ajax episode-list endpoint; bail out cleanly instead of
        # crashing with AttributeError when it is missing.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the rest of the file; plain
                # str() can break on non-ASCII error messages under Python 2.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1970
1971
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string fixes the invalid '\s' escape in a plain literal.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2030
2031
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied (CLI options or .netrc)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials: proceed unauthenticated
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # BUGFIX: POST data must be bytes under Python 3's urllib; urlencode()
        # output is ASCII-safe, so encoding is a no-op on Python 2.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
        try:
            self.report_login()
            # BUGFIX: read() returns bytes on Python 3; decode before matching
            # with a text regex (str pattern on bytes raises TypeError).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # Login page served again => authentication did not succeed
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the (HD if available, else SD) video URL and title."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Normalize any accepted URL form to the canonical video page
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two literal script fragments
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON holding the stream URLs
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2127
2128
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info: either a direct media URL, or via blip.tv's JSON API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the correct separator
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves usable data to this user agent
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL already points at the media file; derive id/title
                # from the last path component.
                basename = url.split('/')[-1]
                # BUGFIX: basename is already text; calling .decode('UTF-8')
                # on str raises AttributeError on Python 3.
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or bare
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # e.g. '08-30-12 09:30AM' -> '20120830'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2218
2219
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble — no such attribute; this
            # branch raised AttributeError instead of reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media server path; the flv file
        # lives in the same directory, named after the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2268
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest to highest; the last available one is picked by default
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Map bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Map bitrate -> frame dimensions (for --list-formats display)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report start of information extraction for an episode."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrates with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part of the requested episode or clip."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations to the full-episodes page
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Single-clip URL: the title group depends on which show it is
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Full-episodes URL: an empty episode part means "newest episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The server redirects to the newest episode; re-parse the final URL
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mgid-style media URI embedded in the player markup
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index lists every part (item) of the episode
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like mgid:...:<show>.com:<media id>
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-part configuration XML lists one <rendition> per bitrate
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) URL to the equivalent plain-HTTP location
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2462
2463
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report start of information extraction for a show."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the flv URL from the player config referenced by the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset declared in the Content-Type header, if any
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull description, thumbnail and player URL from the page's meta tags.
        # NOTE(review): each re.search result is used unguarded; a missing
        # meta tag would raise AttributeError rather than report an error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual video
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2537
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken; kept for reference until the site's player is re-supported
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract metadata from the moogaloop XML and the f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally from the two XML documents below
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m (Adobe HDS) manifest supplies the segment URL components
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL from the manifest's own host and the ids;
        # video_id[:-2] strips a 2-char suffix — presumably a quality marker
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2608
2609
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract flv URL, title and thumbnail from an xvideos watch page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the player parameters)
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Extract title from the <title> tag, dropping the site suffix
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Extract video thumbnail (the whole matched URL is the thumbnail)
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2667
2668
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a soundcloud.com track page to its 128k mp3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The URL path is <uploader>/<slug of the track title>
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL to the track's numeric id via the public API
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions for that track id
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2741
2742
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL, title and description from an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the stream path is base64-encoded in the page
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the stream's filename component
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2796
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Return the candidate url list for format fmt.

        jsonData maps format name -> {bitrate -> url list}, or directly
        format name -> url list when no bitrate info exists (the TypeError
        branch covers the latter shape).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Return the first url in url_list that answers a request, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass # dead url, try the next one
        return None

    def _print_formats(self, formats):
        """Print a table of format/bitrate/extension to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: the old mobj.group(1).decode('utf-8') calls crashed on
        # Python 3, where re groups are already str and have no .decode().
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON; the API answers with utf-8 encoded bytes
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # try every advertised format until one of its urls answers
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # previously this fell through and crashed on file_url.split below
            self._downloader.trouble(u'ERROR: unable to find a working stream url')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2911
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # The named groups decide the extraction mode in _real_extract:
    # 'course' + 'video' -> one video; 'course' only -> course playlist;
    # neither -> the whole site starting from the home page.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, one course, or the whole site.

        Playlist modes collect 'reference' entries and recurse through
        self.extract() on each referenced page, concatenating the results.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            # Per-video metadata lives in an XML file next to the media.
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # The XML carries the title and the media file name
                # (relative to baseUrl).
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title from the page heading; fall back to the id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every linked VideoPage is one video of the course playlist.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Recurse into every course page linked from the home page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3023
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # webpage is already a decoded string; the former
        # .decode('iso-8859-1') calls on the groups below crashed on
        # Python 3 (str has no decode method).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message fixed: it previously read 'unable to mtvn_uri'
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # Ask the mediaGen service for the available stream renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3103
3104
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com video pages."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the seed-dependent permutation of the alphabet below.

        Implements Youku's descrambling scheme: a linear congruential
        generator (x -> (x*211 + 30031) mod 65536) drives a
        pick-and-remove shuffle of the source character list, so the same
        seed always yields the same character ordering.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated scrambled file id.

        Each '*'-delimited token is an index into the seed-derived mix
        string; the decoded id is the concatenation of the looked-up
        characters (empty tokens are skipped).
        """
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # The getPlayList endpoint answers with JSON describing all
        # available stream formats and their segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's format request onto Youku's format names;
            # anything other than best/worst falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        # NOTE(review): the format string below actually replaces the
        # characters at indices 8-9 (fileid[0:8] + '%02X' + fileid[10:]),
        # not 7:9 as the comment above says — confirm which is intended.
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # One info dict per segment; parts share the same title.
            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3214
3215
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv url, title and thumbnail from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page; errors are reported, not raised.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3278
3279
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video link from a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # return was missing here: mobj.group(1) below raised AttributeError
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes on the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # return was missing here: links[-1] below raised IndexError
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3403
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            """Return the first unescaped regex group from the page, or default."""
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # key fixed: was the typo 'uploader_date', which no consumer reads
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3439
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_list).

        On error the problem is reported and (0, []) is returned, so the
        caller's tuple unpacking keeps working (returning None here used
        to crash _real_extract with a TypeError).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archives come paged, _JUSTIN_PAGE_LIMIT at a time.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed) page means we reached the last one.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3526
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUG FIX: bail out here; the original fell through and crashed
            # with AttributeError on m.group() when the page had no <source>.
            self._downloader.trouble(u'ERROR: unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            # BUG FIX: same missing-return defect as above.
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # The description is optional; fall back to None when absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3563
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            # BUG FIX: without the status id the video URL cannot be built;
            # the original fell through and crashed on m.group(1).
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # Non-fatal: warn and continue without a description
            # (BUG FIX: previously crashed on m.group(1) after the warning).
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip the embedded <a> link markup from the tweet text.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            # BUG FIX: missing return after the error report.
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            # BUG FIX: missing return after the error report.
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3612         
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % game_id
        webpage = self._download_webpage(videourl, game_id)
        # Walk the page's movie entries, titles and thumbnails in lockstep;
        # they appear in the same document order.
        movie_entries = re.finditer(r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},", webpage)
        title_entries = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_entries = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)
        videos = []
        for movie, title_m, thumb_m in zip(movie_entries, title_entries, thumb_entries):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail'),
            })
        return videos
3652
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The CDN download URL is derived directly from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = title_match.group('title')
        uploader_match = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage)
        uploader = uploader_match.group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3674
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON blob in an inline script.
        match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # The cbr parameter selects the 256 kbit/s stream.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3709
3710
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats whose 'format' field equals
        req_format, or None when no such entry exists."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site only serves the page when age verification is claimed.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal when absent)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal when absent)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 4th path component encodes "<size>_<bitrate>_<id>".
            size, bitrate = path.split('/')[4].split('_')[:2]
            format = "-".join((size, bitrate))
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this used to test the stale `result` variable (left
            # over from the regex searches above), so an unavailable
            # requested format silently returned [None] instead of erroring.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3827
3828         
3829
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL from the embedded player configuration
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the message used to say "unable to extract video
            # title", which misreported what actually failed here.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3871
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the landing page first.
        webpage = self._download_webpage(url, video_id)

        # The video title lives in the page's <title> element.
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # The actual stream data lives on a separate embed page.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via addVariable().
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3917
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix description is embedded as a JSON blob in an inline script.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id lets us drive the same streaming API the
        # web player uses; each call returns one track.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        track_no = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_no+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
            track_no += 1
        return res
3961
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the media file and its thumbnail follow directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3985
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL = r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match.group('type_talk'):
            # A single talk page.
            return [self._talk_info(url)]
        # Otherwise the URL names a playlist of talks.
        playlist_id = match.group('playlist_id')
        name = match.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        """Build the direct download URL for a talk's media slug."""
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        """Return the info dicts for every talk in the playlist."""
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Pair each playlist entry with its talk link in document order.
        entries = re.finditer(video_RE, webpage, re.VERBOSE)
        names = re.finditer(video_name_RE, webpage)
        results = []
        for entry, name_match in zip(entries, names):
            video_id = entry.group('video_id')
            talk_url = 'http://www.ted.com%s' % name_match.group('talk_url')
            results.append(self._talk_info(talk_url, video_id))
        return results

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = match.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title = re.search(title_RE, webpage).group('title')
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        return {
            'id': video_id,
            'url': self._talk_video_link(mediaSlug),
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail'),
        }
4057
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext`
            # (NameError); fall back to the file extension computed above.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4113
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Instantiate lazily from the ordered class tuple; order is significant.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4165
4166