Formalize URL creation (prepare for some cleanup in blip.tv:users)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) takes a URL and produces metadata about
    the video(s) behind it: the real media URL, the title, the uploader
    and so on.  The metadata is collected into dictionaries which are
    passed on to the FileDownloader; the FileDownloader then acts on that
    information, typically by downloading the video to disk.

    Required dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All of these fields should be Unicode strings.

    Subclasses must redefine _real_initialize() and _real_extract() and
    define a _VALID_URL regexp; they should normally also be registered
    in the list of extractors.  _real_extract() must return a *list* of
    information dictionaries as described above.

    A broken IE should set _WORKING to False so that users are warned
    and its tests are skipped.
    """

    # Class-level defaults; _ready flips to True after initialize() runs.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Run _real_initialize() at most once (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the IE name from the class name by dropping the "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        message = u'Downloading video webpage' if note is None else note
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, message))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            reason = u'Unable to download webpage' if errnote is None else errnote
            # Preserve the original traceback for debugging purposes.
            raise ExtractorError(u'%s: %s' % (reason, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        raw = handle.read()
        return raw.decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: accepts full watch URLs, short youtu.be links, embeds,
    # anchor-redirect URLs and even a naked video ID.  Group 1 is the whole
    # (optional) prefix, group 2 is the video ID itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # URL used to request English pages, so later HTML regexes match.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions shown by --list-formats (listed as HEIGHTxWIDTH).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Overrides the base-class IE_NAME property with a plain attribute.
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL contains comments/whitespace.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Captions without an explicit duration are shown for 4 seconds.
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # Format seconds-as-float into the SRT "HH:MM:SS,mmm" timestamp.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns a (warning_message, srt_contents) pair: on success the
        warning is None, on any failure the contents are None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a mapping of language code -> track name from the track list.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: --subtitles-lang, then English, then first found.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the itag/extension/dimensions table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age before extraction."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the GALX and dsh hidden fields the login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, login failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Extract the video ID from any URL matched by _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 1 is the optional URL prefix, group 2 is the video ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download the watch page and video info, then build the result dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo the JavaScript backslash-escaping in the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then try several date layouts;
            # a failed strptime leaves the previous value intact.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict carries a 'sig' key —
            # this would raise KeyError on a stream without one; TODO confirm.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # One result dict per selected (format, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
587
588
589 class MetacafeIE(InfoExtractor):
590     """Information Extractor for metacafe.com."""
591
592     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
593     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
594     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
595     IE_NAME = u'metacafe'
596
597     def __init__(self, downloader=None):
598         InfoExtractor.__init__(self, downloader)
599
600     def report_disclaimer(self):
601         """Report disclaimer retrieval."""
602         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
603
604     def report_age_confirmation(self):
605         """Report attempt to confirm age."""
606         self._downloader.to_screen(u'[metacafe] Confirming age')
607
608     def report_download_webpage(self, video_id):
609         """Report webpage download."""
610         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
611
612     def report_extraction(self, video_id):
613         """Report information extraction."""
614         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
615
616     def _real_initialize(self):
617         # Retrieve disclaimer
618         request = compat_urllib_request.Request(self._DISCLAIMER)
619         try:
620             self.report_disclaimer()
621             disclaimer = compat_urllib_request.urlopen(request).read()
622         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
623             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
624             return
625
626         # Confirm age
627         disclaimer_form = {
628             'filters': '0',
629             'submit': "Continue - I'm over 18",
630             }
631         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
632         try:
633             self.report_age_confirmation()
634             disclaimer = compat_urllib_request.urlopen(request).read()
635         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
636             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
637             return
638
639     def _real_extract(self, url):
640         # Extract id and simplified title from URL
641         mobj = re.match(self._VALID_URL, url)
642         if mobj is None:
643             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
644             return
645
646         video_id = mobj.group(1)
647
648         # Check if video comes from YouTube
649         mobj2 = re.match(r'^yt-(.*)$', video_id)
650         if mobj2 is not None:
651             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
652             return
653
654         # Retrieve video webpage to extract further information
655         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
656         try:
657             self.report_download_webpage(video_id)
658             webpage = compat_urllib_request.urlopen(request).read()
659         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
660             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
661             return
662
663         # Extract URL, uploader and title from webpage
664         self.report_extraction(video_id)
665         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
666         if mobj is not None:
667             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
668             video_extension = mediaURL[-3:]
669
670             # Extract gdaKey if available
671             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
672             if mobj is None:
673                 video_url = mediaURL
674             else:
675                 gdaKey = mobj.group(1)
676                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
677         else:
678             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
679             if mobj is None:
680                 self._downloader.trouble(u'ERROR: unable to extract media URL')
681                 return
682             vardict = compat_parse_qs(mobj.group(1))
683             if 'mediaData' not in vardict:
684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
685                 return
686             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
687             if mobj is None:
688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
689                 return
690             mediaURL = mobj.group(1).replace('\\/', '/')
691             video_extension = mediaURL[-3:]
692             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
693
694         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
695         if mobj is None:
696             self._downloader.trouble(u'ERROR: unable to extract title')
697             return
698         video_title = mobj.group(1).decode('utf-8')
699
700         mobj = re.search(r'submitter=(.*?);', webpage)
701         if mobj is None:
702             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
703             return
704         video_uploader = mobj.group(1)
705
706         return [{
707             'id':       video_id.decode('utf-8'),
708             'url':      video_url.decode('utf-8'),
709             'uploader': video_uploader.decode('utf-8'),
710             'upload_date':  None,
711             'title':    video_title,
712             'ext':      video_extension.decode('utf-8'),
713         }]
714
715
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # group(1) captures the id+slug portion of any dailymotion TLD video URL.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # Flags this extractor as currently broken against the live site.
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the video info dict (id, url, uploader, date, title, ext)
        from a Dailymotion video page, or report trouble and return None."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the title slug ('_...') and any query string from the match.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are served too.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities in descending order of preference; the for/else
        # runs the else branch only when no key matched at all.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # Unquote, then unescape the JSON-escaped slashes ('\/').
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for an official user instead
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # The page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
803
804
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report that the webpage is being downloaded."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report that information extraction has started."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Pull the flv URL, title and uploader out of a Photobucket page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Fetch the page the video is embedded in.
        self.report_download_webpage(video_id)
        try:
            webpage = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL is advertised via the video_src <link> element.
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader come from a single <title> pattern.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
868
869
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # Flags this extractor as currently broken against the live site.
    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information.

        Non-'/watch/' URLs are first resolved to the canonical English
        /watch/ form and re-extracted (one level of recursion, signalled
        by new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) captures the 'people'/'profile' path segment;
        # the uploader name is in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1011
1012
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video info dictionary from a Vimeo page.

        Parses the embedded player config JSON for title, uploader,
        thumbnail and file variants, then builds the play_redirect URL.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize: force a scheme, and canonicalize direct
        # play_redirect_hls links to the plain video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: a bare 'except:' here also swallowed KeyboardInterrupt
        # and SystemExit; only real errors from split/parse are expected.
        except Exception:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality; the for/else reports failure
        # only when no codec matched at all.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1131
1132
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<n>.html.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the response body.

        NOTE(review): on failure this reports trouble and returns None;
        grep_webpage then calls re.search(..., None), which raises
        TypeError — confirm whether trouble() always aborts before that.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map capture groups to dict keys.

        matchTuples is a list of (group_index, key, error_message) triples;
        returns the populated dict, or None after reporting the first
        error_message whose group did not match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # The language code is a fixed path segment of the URL.
        video_lang = url.split('/')[-4]
        # First hop: locate the videothek JS file referenced by the page.
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Second hop: pull the stream path, SWF player and rtmp URL out of it.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or used, so
        # live streams are effectively not downloadable from here — confirm
        # whether live support was intentionally left unfinished.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page to its video info dict via two XML hops."""
        video_lang = url.split('/')[-3]
        # First hop: the videoref XML URL embedded in the flash movie params.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Second hop: pick the <video> entry matching the page language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: id, title, date and the HD stream URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Arte+7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live URLs currently yield nothing (see extractLiveStream note).
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1267
1268
1269 class GenericIE(InfoExtractor):
1270     """Generic last-resort information extractor."""
1271
1272     _VALID_URL = r'.*'
1273     IE_NAME = u'generic'
1274
    def __init__(self, downloader=None):
        # Plain delegation; GenericIE adds no state of its own.
        InfoExtractor.__init__(self, downloader)
1277
1278     def report_download_webpage(self, video_id):
1279         """Report webpage download."""
1280         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1281         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1282
1283     def report_extraction(self, video_id):
1284         """Report information extraction."""
1285         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1286
1287     def report_following_redirect(self, new_url):
1288         """Report information extraction."""
1289         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1290
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # A Request subclass that issues HEAD instead of GET, so we can
        # resolve the final URL without downloading any body.
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Strip body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying via GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following: no redirect, continue normal extraction.
        if url == new_url:
            return False

        # Redirect detected: restart the download chain with the final URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1345
1346     def _real_extract(self, url):
1347         if self._test_redirect(url): return
1348
1349         video_id = url.split('/')[-1]
1350         request = compat_urllib_request.Request(url)
1351         try:
1352             self.report_download_webpage(video_id)
1353             webpage = compat_urllib_request.urlopen(request).read()
1354         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1355             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1356             return
1357         except ValueError as err:
1358             # since this is the last-resort InfoExtractor, if
1359             # this error is thrown, it'll be thrown here
1360             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1361             return
1362
1363         self.report_extraction(video_id)
1364         # Start with something easy: JW Player in SWFObject
1365         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1366         if mobj is None:
1367             # Broaden the search a little bit
1368             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1369         if mobj is None:
1370             # Broaden the search a little bit: JWPlayer JS loader
1371             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1372         if mobj is None:
1373             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1374             return
1375
1376         # It's possible that one of the regexes
1377         # matched, but returned an empty group:
1378         if mobj.group(1) is None:
1379             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380             return
1381
1382         video_url = compat_urllib_parse.unquote(mobj.group(1))
1383         video_id = os.path.basename(video_url)
1384
1385         # here's a fun little line of code for you:
1386         video_extension = os.path.splitext(video_id)[1][1:]
1387         video_id = os.path.splitext(video_id)[0]
1388
1389         # it's tempting to parse this further, but you would
1390         # have to take into account all the variations like
1391         #   Video Title - Site Name
1392         #   Site Name | Video Title
1393         #   Video Title - Tagline | Site Name
1394         # and so on and so forth; it's just not practical
1395         mobj = re.search(r'<title>(.*)</title>', webpage)
1396         if mobj is None:
1397             self._downloader.trouble(u'ERROR: unable to extract title')
1398             return
1399         video_title = mobj.group(1)
1400
1401         # video uploader is domain name
1402         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1403         if mobj is None:
1404             self._downloader.trouble(u'ERROR: unable to extract title')
1405             return
1406         video_uploader = mobj.group(1)
1407
1408         return [{
1409             'id':       video_id,
1410             'url':      video_url,
1411             'uploader': video_uploader,
1412             'upload_date':  None,
1413             'title':    video_title,
1414             'ext':      video_extension,
1415         }]
1416
1417
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a "ytsearch<n>:<terms>" pseudo-URL and run the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only: the search terms themselves may
        # contain colons (the old unbounded split raised "too many values
        # to unpack" on e.g. "ytsearch5:foo: bar").
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true hit count; shrink the limit so the
            # loop stops as soon as either n or the total is reached.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # Renamed loop variable so the builtin id() is not shadowed
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1492
1493
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a "gvsearch<n>:<terms>" pseudo-URL and run the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only: search terms may legitimately
        # contain colons (the old unbounded split raised ValueError).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            # No "next page" link: queue what was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1574
1575
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a "yvsearch<n>:<terms>" pseudo-URL and run the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only: search terms may legitimately
        # contain colons (the old unbounded split raised ValueError).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            # No "Next" link: queue what was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1660
1661
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: list-type prefix ('p', 'a' or 'list'); group 2: playlist id;
    # group 3: optional single-video id embedded in the URL.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the playlist for download.

        Honours the 'playliststart'/'playlistend' downloader params.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        # Fetch page after page until the "next page" marker disappears.
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (de-duplicated within the page only)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the --playlist-start/--playlist-end window (start is 1-based)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1740
1741
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the channel for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # Walk the channel's paginated video list until the "next page"
        # marker disappears, collecting video ids along the way.
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            try:
                request = compat_urllib_request.Request(page_url)
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Ids appearing on this page, first-seen order, no duplicates
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1792
1793
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue the user's uploads for download via the GData API."""
        # Pull the username out of the URL (or the ytuser: shorthand)
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries,
        # so keep requesting successive pages until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(gdata_url)).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Ids appearing on this page, first-seen order, no duplicates
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page shorter than the page size means there is nothing
            # more to fetch, so we can stop querying right away.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the --playlist-start/--playlist-end window (start is 1-based)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1876
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue every video of the blip.tv user for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Robustness fix: a page without the expected attribute used to
        # raise an uncaught AttributeError on mobj.group(1).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Consistency fix: compat_str, like the rest of the file
                # (plain str() can fail on unicode messages in Python 2)
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Dedup fix: compare the unescaped
            # value (the old code tested the raw href but stored the
            # unescaped one, so escaped duplicates slipped through).
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1966
1967
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles.com link to the direct file URL.

        Simulates pressing the 'Free download' button via a POST and
        scrapes the response for the real download location and title.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace so the site's message fits on one line
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls below assume the match
        # groups are byte strings (webpage is read undecoded above), i.e.
        # Python 2 semantics — confirm before porting to Python 3.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2026
2027
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook video/photo pages."""

    # The numeric video id is captured in the named group 'ID'.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name used when looking up credentials in ~/.netrc.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Best-effort login before extraction.

        Credentials come from the downloader params ('username'/'password')
        or, if 'usenetrc' is set, from the 'facebook' entry in ~/.netrc.
        Every failure is reported as a warning only, and extraction
        continues unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials configured: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means we are still logged out.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video info from a Facebook URL.

        Returns a one-element list with the info dict; on an invalid URL
        it reports trouble and returns None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Canonicalize to the desktop video page before downloading.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player data sits between these two literal JavaScript
        # fragments; they are re.escape()d and used as anchors.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the stream URLs.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream; fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2123
2124
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the file extension from the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info via blip.tv's JSON skin.

        Appends 'skin=json&version=2&no_wrap=1' to the page URL.  If the
        server instead answers with the media itself (Content-Type
        video/*), a direct-download info dict is built from the URL;
        otherwise the JSON response is parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query, respecting an existing query string.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): on Python 3, 'title' is already str and has
                # no .decode(); this looks like a Python 2 leftover — verify.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # The downloader reuses this already-open handle.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Responses are either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Normalize 'datestamp' (format '%m-%d-%y %H:%M%p') to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2214
2215
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page.

        Returns a one-element list with the info dict, or None after
        reporting trouble.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this previously called self._download.trouble,
            # which raised AttributeError instead of reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media base URL; the video itself
        # lives at <base>/<id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2264
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, in ascending order.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame size, shown by --list-formats
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report the download of one media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report the download of the show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print available bitrates with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract every part of an episode or clip.

        Returns a list of info dicts (one per <item> in the MRSS index),
        or None after reporting trouble.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut forms (":tds", ":colbert", ...) point at the show's
        # full-episodes page; rewrite the URL and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "newest episode": the server
            # redirect below resolves it to a concrete episode URL.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the post-redirect URL to learn the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Collect the mtvnservices media URIs embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index lists every part (<item>) of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-part configuration listing the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Translate the RTMP URL into the equivalent HTTP download URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2458
2459
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract a video from an Escapist episode page.

        Reads description/thumbnail/player from <meta> tags, then fetches
        the player's JSON-ish config to find the actual media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in Content-Type, default UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The config URL is passed to the player as a query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # NOTE(review): index 1 is assumed to be the actual video entry;
        # confirm against the player's config layout.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2533
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # This IE is currently marked as not working.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report the download of the f4m manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract an f4f fragment URL via the moogaloop metadata XML
        and the referenced Adobe f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: site metadata (title, description, manifest URL).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Second request: the f4m manifest referenced by the metadata.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first-fragment URL from the manifest's host and ids.
        # NOTE(review): the video_id[:-2] trim looks magic — verify against
        # actual manifest id values.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2604
2605
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Locate the percent-encoded FLV URL in the page source.
        url_mobj = re.search(r'flv_url=(.+?)&', webpage)
        if url_mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_mobj.group(1))

        # The page <title> carries the video title before the site suffix.
        title_mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_mobj.group(1)

        # For the thumbnail the entire matched URL is used (group 0).
        thumb_mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_mobj.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2663
2664
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The permalink is first resolved to the track's numeric id through
    the public resolve.json API; the streams endpoint then yields the
    128kbps MP3 URL used for the download.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report permalink resolution."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report stream retrieval."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the uploader and the song slug are part of the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink to the track's numeric id.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Ask the CDN for the streams available for this track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2737
2738
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The RTMP path is stored base64-encoded in a JS variable.
        id_mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if id_mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(id_mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # The title lives in a JS assignment rather than in markup.
        title_mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if title_mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_mobj.group(1)

        # The description is optional; fall back to a placeholder.
        video_description = u'No description available.'
        desc_mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if desc_mobj is not None:
            video_description = desc_mobj.group(1)

        # Derive id and extension from the URL's last path component.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2792
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def _force_unicode(text):
        """Return text as a unicode string on both Python 2 and 3.

        The previous code called .decode('utf-8') unconditionally, which
        raises AttributeError on Python 3 where the values are already str.
        """
        if isinstance(text, bytes):
            return text.decode('utf-8')
        return text

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue # dead link, try the next one

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = self._force_unicode(mobj.group(1))
        file_id = uploader + u"-" + self._force_unicode(mobj.group(2))

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (json.loads already yields text strings, so no
        # further decoding of its values is needed)
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try each advertised format until one yields a live URL
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else self._force_unicode(format_param)),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2907
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Handles three URL shapes, distinguished by the named regex groups:
    a specific video ('course' and 'video'), a course page ('course'
    only, expanded into references to each video page), and the root
    page (neither, expanded into references to each course page).
    Playlist branches recurse through self.extract() on each reference.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this method raises ExtractorError for an invalid
        # URL but uses trouble()+return for later failures — inconsistent
        # with the rest of the file; confirm before unifying.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # findall(...)[0] raises IndexError when the tag is missing
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id if the <h1> is absent.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect unique video-page links and recurse into each one.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect unique course-page links and recurse into each one.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3019
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns decoded text; the former
        # .decode('iso-8859-1') on these groups raised AttributeError on
        # Python 3 (and UnicodeDecodeError for non-ASCII text on Python 2).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message previously read 'unable to mtvn_uri' (missing verb)
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # Ask the mediaGen service for the rendition list of this video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3099
3100
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in segments; the segment URLs are derived from a
    scrambled file id which is unscrambled with a seed supplied by the
    getPlayList API.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: millisecond timestamp plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the source alphabet using a linear
        # congruential generator driven by the server-provided seed.
        # The exact update constants (211, 30031, 65536) must match the
        # site's player, so do not change them.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Map each '*'-separated index in fileId through the shuffled
        # alphabet to recover the real file id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Format selection: 'best' prefers hd2, 'worst' uses mp4.
            # NOTE(review): any other explicit format request falls
            # through to flv regardless of what was asked — confirm
            # whether that fallback is intentional.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3210
3211
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page; failures are reported, not raised.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Pull the URL, title and thumbnail out of the player page.
        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3274
3275
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            return  # previously fell through and crashed on mobj.group(1)

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return  # previously fell through and crashed on links[-1]

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3399
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The MP4 URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # key was misspelled 'uploader_date'; the documented optional
            # field (and what the downloader reads) is 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3435
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_dicts).

        On error the problem is reported and (0, []) is returned, so the
        caller can always unpack the result (the previous code returned
        None here, which crashed the caller's tuple unpacking).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # A bare channel URL (one group matched) means paged archives;
        # a /b/ URL addresses a single broadcast.
        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3522
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUG FIX: bail out here instead of calling .group() on None below.
            self._downloader.trouble(u'ERROR: unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            # BUG FIX: same None-dereference guard for the title.
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # The description is optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3559
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            # BUG FIX: without the status ID there is no video URL to build;
            # previously the code fell through and crashed on m.group(1).
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # Description is treated as optional (warning only), so don't crash.
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip embedded links from the tweet text.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3608         
class SteamIE(InfoExtractor):
    """Extracts every trailer listed on a Steam store video page."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Always fetch the game's video listing page, whatever URL we got.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        # The three patterns appear in the same order on the page, so the
        # matches can be paired positionally.
        triples = zip(re.finditer(urlRE, webpage),
                      re.finditer(namesRE, webpage),
                      re.finditer(thumbsRE, webpage))
        videos = []
        for vid, vtitle, thumb in triples:
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail')
            })
        return videos
3648
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV can be fetched straight from the CDN given the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3670
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded in the page as a JSON blob.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        try:
            data = json.loads(m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream; the extension comes from the path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3705
3706
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pretend we passed it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; only warn on failure)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional; only warn on failure)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes size and bitrate, e.g. "480p_370k_...".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # Formats are listed best-first on the page.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this used to test the stale regex variable `result`
            # instead of the lookup result, so an unavailable requested
            # format was returned as [None] instead of being reported.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3823
3824         
3825
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the id and the title are part of the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the old message wrongly said "unable to extract video
            # title" although this branch handles a missing upload date.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3867
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # The page title doubles as the video title.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3913
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JSON assignment.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # Each playback session gets a random id; the API then hands out one
        # track per request until at_last_track is set.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3957
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the video file and its thumbnail derive directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3981
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Matches both playlist and single-talk URLs; compiled with re.VERBOSE.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched:
        # a single talk yields one info dict, a playlist yields a list.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # The id/slug entries and the title links appear in the same order on
        # the page, so the two finditer streams are paired positionally.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Each talk page is fetched individually for its full metadata.
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is a JS object literal; we only pull the id and the
        # mediaSlug out of it, from which the download URL is derived.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4053
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext`,
            # raising NameError whenever <format_id> was missing; fall back
            # to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4109
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this tuple ordered from most to least specific matcher;
    # GenericIE must stay last as the catch-all.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ie_classes]
4161
4162