Merge remote-tracking branch 'jaimeMF/TED'
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces the metadata needed
    to download the video(s) it refers to: the real media URL, the title,
    the uploader, and so on.  That metadata is handed to the FileDownloader
    as a dictionary; the FileDownloader then decides what to do with it
    (usually, download the video to the file system).

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and redefine
    _real_initialize() and _real_extract(); _real_extract() must return a
    *list* of information dictionaries as described above.  They should
    also be added to the list of extractors, and broken extractors should
    set _WORKING to False so users are warned and the tests are skipped.
    """

    # Lazily-initialized shared state.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this extractor can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this extractor is known to be working."""
        return self._WORKING

    def initialize(self):
        """Perform one-time initialization (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader used for output and options."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the name from the class name, dropping the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open the URL (or Request) and return the response handle.

        Raises ExtractorError on any network failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        if errnote is None:
            errnote = u'Unable to download webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download the page and return its content as a unicode string."""
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex (see suitable()): group 1 is the URL prefix up to the
    # video ID, group 2 is the video ID itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces the English interface; fetched once in _real_initialize().
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original target of an age-verification/redirect URL.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same formats, but with the WebM variants ranked ahead of their
    # proprietary counterparts (used with --prefer-free-formats).
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Format id -> container extension; anything missing defaults to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # Format id -> pixel dimensions (apparently listed as height x width
    # — TODO confirm against current format tables).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Overrides the base-class IE_NAME property with a plain attribute.
    IE_NAME = u'youtube'
190
191     def suitable(self, url):
192         """Receives a URL and returns True if suitable for this IE."""
193         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
194
    # Status-reporting helpers; all output is routed through the attached
    # FileDownloader's to_screen().
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
230
231     def _closed_captions_xml_to_srt(self, xml_string):
232         srt = ''
233         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
234         # TODO parse xml instead of regex
235         for n, (start, dur_tag, dur, caption) in enumerate(texts):
236             if not dur: dur = '4'
237             start = float(start)
238             end = start + float(dur)
239             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
240             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
241             caption = unescapeHTML(caption)
242             caption = unescapeHTML(caption) # double cycle, intentional
243             srt += str(n+1) + '\n'
244             srt += start + ' --> ' + end + '\n'
245             srt += caption + '\n\n'
246         return srt
247
    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id and convert them to SRT.

        Returns a (warning, srt) pair; exactly one of the two is None.
        """
        self.report_video_subtitles_download(video_id)
        # First fetch the list of available caption tracks.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map lang_code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language priority: explicit --subtitleslang, then English, then
        # whichever language the listing returned first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        # Fetch the actual caption track in timedtext XML form.
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
280
281     def _print_formats(self, formats):
282         print('Available formats:')
283         for x in formats:
284             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
285
    def _real_initialize(self):
        """Set the interface language and optionally log in and confirm age.

        Credentials come from --username/--password or the user's .netrc;
        each network failure is reported (warning, or error for the age
        confirmation) and aborts the remaining steps.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page to scrape the GALX and dsh hidden fields,
        # which must be echoed back in the login form.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being present in the response means the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        # NOTE(review): unlike login_data above, this POST body is not
        # .encode('ascii')-ed — presumably fine on Python 2; confirm for
        # Python 3, where urlopen requires bytes data.
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return
389
390     def _extract_id(self, url):
391         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
392         if mobj is None:
393             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
394             return
395         video_id = mobj.group(2)
396         return video_id
397
    def _real_extract(self, url):
        """Extract video metadata for a YouTube URL.

        Returns a list of info dictionaries, one per selected format, or
        None after reporting trouble on any fatal error.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-style backslash escaping in the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            # The first expression that parses rewrites upload_date to
            # YYYYMMDD; later strptime attempts then fail and are ignored
            # by the bare except.
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description (get_element_by_id/clean_html come from .utils)
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token (extracted here but not used further in this method)
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): the filter above only checks 'itag'/'url'; a
            # stream dict without 'sig' would raise KeyError here — verify
            # the upstream data always includes it.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
587
588
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video id, group 2 the simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter pages, fetched/POSTed once in _real_initialize —
    # apparently to turn the filter off and confirm age; see the form there.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
596
597     def __init__(self, downloader=None):
598         InfoExtractor.__init__(self, downloader)
599
    # Status-reporting helpers; output goes through the FileDownloader.
    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
615
    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form.

        Both steps report trouble and abort on network failure.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        # NOTE(review): the POST body is a str, not bytes — presumably fine
        # on Python 2; Python 3's urlopen requires bytes. Confirm.
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return
638
639     def _real_extract(self, url):
640         # Extract id and simplified title from URL
641         mobj = re.match(self._VALID_URL, url)
642         if mobj is None:
643             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
644             return
645
646         video_id = mobj.group(1)
647
648         # Check if video comes from YouTube
649         mobj2 = re.match(r'^yt-(.*)$', video_id)
650         if mobj2 is not None:
651             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
652             return
653
654         # Retrieve video webpage to extract further information
655         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
656         try:
657             self.report_download_webpage(video_id)
658             webpage = compat_urllib_request.urlopen(request).read()
659         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
660             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
661             return
662
663         # Extract URL, uploader and title from webpage
664         self.report_extraction(video_id)
665         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
666         if mobj is not None:
667             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
668             video_extension = mediaURL[-3:]
669
670             # Extract gdaKey if available
671             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
672             if mobj is None:
673                 video_url = mediaURL
674             else:
675                 gdaKey = mobj.group(1)
676                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
677         else:
678             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
679             if mobj is None:
680                 self._downloader.trouble(u'ERROR: unable to extract media URL')
681                 return
682             vardict = compat_parse_qs(mobj.group(1))
683             if 'mediaData' not in vardict:
684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
685                 return
686             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
687             if mobj is None:
688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
689                 return
690             mediaURL = mobj.group(1).replace('\\/', '/')
691             video_extension = mediaURL[-3:]
692             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
693
694         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
695         if mobj is None:
696             self._downloader.trouble(u'ERROR: unable to extract title')
697             return
698         video_title = mobj.group(1).decode('utf-8')
699
700         mobj = re.search(r'submitter=(.*?);', webpage)
701         if mobj is None:
702             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
703             return
704         video_uploader = mobj.group(1)
705
706         return [{
707             'id':       video_id.decode('utf-8'),
708             'url':      video_url.decode('utf-8'),
709             'uploader': video_uploader.decode('utf-8'),
710             'upload_date':  None,
711             'title':    video_title,
712             'ext':      video_extension.decode('utf-8'),
713         }]
714
715
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Scrape a Dailymotion watch page and return the video info list."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The path component may carry a "_title" slug and a query string;
        # strip both to keep only the bare id.
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so restricted
        # videos are served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Probe for the best available quality, highest first.
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        media_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if media_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(media_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            # Fall back to looking for an official user.
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            # The page shows DD-MM-YYYY; reorder into YYYYMMDD.
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
803
804
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a Photobucket page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # The id is the .flv filename from the "current" query parameter.
        video_id = match.group(1)
        video_extension = 'flv'

        # Download the page that embeds the player.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        media_match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if media_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(media_match.group(1))

        # Title and uploader share one <title> pattern.
        title_match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = title_match.group(1).decode('utf-8')
        video_uploader = title_match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
868
869
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video page.

        Non-'/watch/' URLs are first resolved to their canonical /watch/
        form and re-extracted via a single recursive call with
        new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUG FIX: group(1) captures the '(people|profile)' alternation, not
        # the name; the uploader name is the second capturing group.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width, needed for the playlist request below.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1011
1012
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo page via its embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: force https and canonicalize direct-link forms.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
            # are no longer swallowed; any parse failure still lands here.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available bucket, preferring hd, then sd.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1131
1132
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # Malformed URLs make compat_urllib_request raise ValueError.
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map capture groups to dict keys.

        matchTuples is a list of (group_index, key, error_message) triples;
        each named group is stored under its key. Returns the dict of
        extracted values, or None after reporting an error.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed it returns None, and
        # re.search on None raises TypeError instead of reporting — confirm.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # A required group failed to match; report its message.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the rtmp URL."""
        # Language code is taken from the URL path (fr/de).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is built but never returned or used, so
        # live streams currently yield no downloadable result — confirm intent.

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two XML hops to the final info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: the per-video XML with id, title, date and the HD URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extraction path by URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams: extraction runs but no info is returned (see note).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1267
1268
1269 class GenericIE(InfoExtractor):
1270     """Generic last-resort information extractor."""
1271
1272     _VALID_URL = r'.*'
1273     IE_NAME = u'generic'
1274
    def __init__(self, downloader=None):
        # Plain delegation to the base class; GenericIE keeps no extra state.
        InfoExtractor.__init__(self, downloader)
1277
1278     def report_download_webpage(self, video_id):
1279         """Report webpage download."""
1280         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1281         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1282
1283     def report_extraction(self, video_id):
1284         """Report information extraction."""
1285         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1286
1287     def report_following_redirect(self, new_url):
1288         """Report information extraction."""
1289         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1290
    def _test_redirect(self, url):
        """Check whether *url* redirects (e.g. a URL shortener); if so, restart
        the extraction chain on the target URL and return True."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: the HEAD follow-up has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other status is surfaced as an HTTP error.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the rejected response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Unchanged URL means no redirect took place.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1345
1346     def _real_extract(self, url):
1347         if self._test_redirect(url): return
1348
1349         video_id = url.split('/')[-1]
1350         request = compat_urllib_request.Request(url)
1351         try:
1352             self.report_download_webpage(video_id)
1353             webpage = compat_urllib_request.urlopen(request).read()
1354         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1355             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1356             return
1357         except ValueError as err:
1358             # since this is the last-resort InfoExtractor, if
1359             # this error is thrown, it'll be thrown here
1360             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1361             return
1362
1363         self.report_extraction(video_id)
1364         # Start with something easy: JW Player in SWFObject
1365         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1366         if mobj is None:
1367             # Broaden the search a little bit
1368             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1369         if mobj is None:
1370             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1371             return
1372
1373         # It's possible that one of the regexes
1374         # matched, but returned an empty group:
1375         if mobj.group(1) is None:
1376             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1377             return
1378
1379         video_url = compat_urllib_parse.unquote(mobj.group(1))
1380         video_id = os.path.basename(video_url)
1381
1382         # here's a fun little line of code for you:
1383         video_extension = os.path.splitext(video_id)[1][1:]
1384         video_id = os.path.splitext(video_id)[0]
1385
1386         # it's tempting to parse this further, but you would
1387         # have to take into account all the variations like
1388         #   Video Title - Site Name
1389         #   Site Name | Video Title
1390         #   Video Title - Tagline | Site Name
1391         # and so on and so forth; it's just not practical
1392         mobj = re.search(r'<title>(.*)</title>', webpage)
1393         if mobj is None:
1394             self._downloader.trouble(u'ERROR: unable to extract title')
1395             return
1396         video_title = mobj.group(1)
1397
1398         # video uploader is domain name
1399         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1400         if mobj is None:
1401             self._downloader.trouble(u'ERROR: unable to extract title')
1402             return
1403         video_uploader = mobj.group(1)
1404
1405         return [{
1406             'id':       video_id,
1407             'url':      video_url,
1408             'uploader': video_uploader,
1409             'upload_date':  None,
1410             'title':    video_title,
1411             'ext':      video_extension,
1412         }]
1413
1414
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ytsearch:, ytsearchN: and ytsearchall: pseudo-URLs by querying
    the GData API and queueing the resulting watch URLs for download.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # _real_extract encodes the query as UTF-8, so decode it the same
        # way; decoding with preferredencoding() mangled it on non-UTF-8
        # locales.
        if isinstance(query, bytes):
            query = query.decode('utf-8')
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            # Decode explicitly: json.loads() rejects bytes on Python 3 < 3.6.
            api_response = json.loads(data.decode('utf-8'))['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer hits than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1489
1490
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles gvsearch:, gvsearchN: and gvsearchall: pseudo-URLs by scraping
    the search result pages and queueing the video URLs for download.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # _real_extract encodes the query as UTF-8, so decode it the same
        # way; decoding with preferredencoding() mangled it on non-UTF-8
        # locales.
        if isinstance(query, bytes):
            query = query.decode('utf-8')
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode to text so the str regexes below work on Python 3,
                # where urlopen().read() returns bytes.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1571
1572
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles yvsearch:, yvsearchN: and yvsearchall: pseudo-URLs by scraping
    the search result pages and queueing the watch URLs for download.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # _real_extract encodes the query as UTF-8, so decode it the same
        # way; decoding with preferredencoding() mangled it on non-UTF-8
        # locales.
        if isinstance(query, bytes):
            query = query.decode('utf-8')
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode to text so the str regexes below work on Python 3,
                # where urlopen().read() returns bytes.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1657
1658
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the playlist for download."""
        # Match the URL against the playlist pattern.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A third capture group means the URL points at a single video.
        single_video = mobj.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # 'p' is the default playlist prefix; artist pages need other params.
        if mobj.group(1) == 'a':
            prefix, access = 'a', 'artist'
        else:
            prefix, access = 'p', 'view_play_list'
        playlist_id = mobj.group(2)

        # Walk the paginated playlist until the "next" marker disappears.
        collected = []
        page_no = 1
        while True:
            self.report_download_page(playlist_id, page_no)
            page_url = self._TEMPLATE_URL % (access, prefix, playlist_id, page_no)
            try:
                req = compat_urllib_request.Request(page_url)
                page = compat_urllib_request.urlopen(req).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect watch ids, de-duplicating within this page only.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            collected.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_no += 1

        total = len(collected)

        # Honor --playlist-start / --playlist-end (1-based; -1 == no end).
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        collected = collected[start:] if end == -1 else collected[start:end]

        if len(collected) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(collected)))

        for vid in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1737
1738
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the channel for download."""
        # The channel id is the single capture group of _VALID_URL.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return
        channel_id = match.group(1)

        # Walk the paginated video list until the "next" marker disappears.
        collected = []
        page_no = 1
        while True:
            self.report_download_page(channel_id, page_no)
            page_url = self._TEMPLATE_URL % (channel_id, page_no)
            try:
                req = compat_urllib_request.Request(page_url)
                page = compat_urllib_request.urlopen(req).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect watch ids, de-duplicating within this page only.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            collected.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_no += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected)))

        for vid in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1789
1790
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue every upload of the user for download."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return
        username = match.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # keep paging until a short (non-full) page signals the end.
        collected = []
        page_no = 0
        while True:
            first = page_no * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, first)

            req = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first))
            try:
                page = compat_urllib_request.urlopen(req).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids, de-duplicating within this page only.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            collected.extend(page_ids)

            # A page that is not full must be the last one; stop querying.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            page_no += 1

        all_ids_count = len(collected)

        # Honor --playlist-start / --playlist-end (1-based; -1 == no end).
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        collected = collected[start:] if end == -1 else collected[start:end]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(collected)))

        for video_id in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1872
1873
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a blip.tv user and queue them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is needed for the episode-list AJAX endpoint;
        # bail out cleanly if the page layout changed (previously this was
        # an uncaught AttributeError on mobj.group).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract users-id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Use compat_str like the rest of the file (plain str() can
                # fail on non-ASCII error messages under Python 2).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape before the membership test: the raw group was
                # compared against already-unescaped entries before, which
                # defeated the de-duplication.
                vid = unescapeHTML(mobj.group(1))
                if vid not in ids_in_page:
                    ids_in_page.append(vid)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1964
1965
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode to text: the str regexes below raise TypeError on bytes
            # under Python 3, and the old .decode('utf-8') calls on the match
            # groups and file_id broke there as well.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2024
2025
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Credentials come from the downloader params ('username'/'password')
        or, with 'usenetrc' set, from the 'facebook' machine in ~/.netrc.
        Without credentials this is a no-op; login failures only warn so
        extraction of public videos can still be attempted.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # POST data must be bytes under Python 3; on Python 2 urlencode()
        # already returns an ASCII byte string, so .encode() is harmless.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('ascii'))
        try:
            self.report_login()
            # Decode the response so the str regex below works under
            # Python 3 (re cannot mix str patterns with bytes input).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the mp4 URL (HD preferred, SD fallback) and metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two literal script chunks
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON holding the stream locations
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD source and fall back to SD
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2121
2122
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        Requests the page with skin=json appended; if the server answers
        with a video/* Content-Type the URL is treated as a direct
        download, otherwise the JSON body is parsed for the metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # NOTE(review): the iTunes user agent appears to be required for
        # this JSON endpoint — confirm before changing.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # basename is text on Python 3 but a byte string on
                # Python 2; decode only when it actually is bytes, since
                # str.decode does not exist on Python 3.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the extension from the media URL
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2212
2213
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this used to call self._download.trouble (a typo),
            # which raised AttributeError instead of reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link reveals the media base URL; the flv lives
        # next to the thumbs directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2262
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate codes, lowest to highest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate code -> container extension (currently always mp4).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate code -> frame dimensions, shown by _print_formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains layout whitespace, so the match must use
        # re.VERBOSE (presumably why the default implementation is
        # overridden here — TODO confirm against the base class).
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report the download of one part's mediaGen configuration."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report the download of the episode's RSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format codes with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return one info dict per part of the requested episode or clip.

        Flow: normalize ':tds'-style shortcuts to the full-episodes page,
        follow the redirect to the newest episode when no specific one was
        given, locate the mtvnservices URI in the page, download the RSS
        index listing the parts, then fetch each part's config and rewrite
        its rtmp URL to a plain http one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut forms (":tds", ":colbert", ...) map to the show's
        # full-episodes page, then the URL is re-parsed.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "newest episode"; the site
            # redirects the bare full-episodes page to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Learn the concrete episode from the redirected URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        # Each <item> in the RSS index is one part of the episode.
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like a colon-separated media id; its last segment
            # is the short id and the one before it names the show site.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp url) pairs from the config's renditions.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) URL into the equivalent plain-http one.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2456
2457
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the flv URL and metadata from an Escapist video page.

        Reads the page's og:video meta tag to find the flash player URL,
        pulls the config location from its query string, and takes the
        video URL from the downloaded config.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset declared in Content-Type, falling back
            # to utf-8 when none is present.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): these matches are used without None checks, so a
        # missing meta tag raises AttributeError instead of a friendly
        # error message.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            # Same charset handling as for the page itself.
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The playlist entry at index 1 is used for the video URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2531
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): _WORKING = False presumably marks this extractor as
    # currently broken/disabled — confirm against the base class.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract an f4f fragment URL via the metadata XML and manifest.

        First downloads the moogaloop metadata XML for the video id, then
        the manifest it points to, and finally assembles the fragment URL
        from pieces of both.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Per-video metadata XML (description, caption, thumbnail, manifest).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest uses the Adobe f4m namespace; note the media
        # node's 'url' attribute actually serves as a node id here.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL on the manifest's host: '/z' + the
        # manifest id minus its last two characters + node id + fragment.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2602
2603
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce on screen that extraction started for video_id."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract id, flv URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flv URL is URL-encoded inside the flashvars
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # The page title carries the video title before the site suffix
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # For the thumbnail the whole matched URL is used (group 0)
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2661
2662
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that stream information is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a soundcloud permalink and return its mp3 stream info."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The permalink encodes the uploader and a slug of the song title
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the permalink into the track's JSON description
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN for the stream locations of this track id
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2735
2736
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL and metadata from an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The stream path is stored base64-encoded in the page source
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # The description is optional; fall back to a placeholder
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is None:
            video_description = u'No description available.'
        else:
            video_description = mobj.group(1)

        # Id and extension come from the last path component of the URL
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2790
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry is a dict of bitrate -> url list, pick the
        requested (or highest) bitrate; when it carries no bitrate info at
        all, indexing with a bitrate raises TypeError and we fall back to
        the entry itself.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate/extension combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: re.match on a text URL yields text groups already; the old
        # .decode('utf-8') calls broke on Python 3 (str has no decode()).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes explicitly, like the other IEs do)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2905
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, a course playlist, or the whole site.

        Dispatches on which named groups of _VALID_URL matched:
        - 'course' and 'video': one video, metadata from a per-video XML file
        - 'course' only: a course page, recursing into each VideoPage link
        - neither: the root page, recursing into each CoursePage link
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # <title> and <videoFile> are required; IndexError means the
                # XML did not have the expected structure.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # extension = everything after the last dot of the video URL
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # collect unique video-page links, preserving page order
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # recurse: each referenced URL re-enters this extractor
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # collect unique course-page links, preserving page order
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # recurse: each course page resolves to its videos
                results += self.extract(entry['url'])
            return results
3017
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The webpage is already decoded text, so the match groups are text
        # too; the old .decode('iso-8859-1') calls broke on Python 3
        # (str has no decode()).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message previously read 'unable to mtvn_uri' (missing verb)
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # e.g. type="video/mp4" -> ext "mp4"
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3097
3098
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in segments; each segment's download URL is derived
    from an obfuscated file id (see _get_file_id) plus a per-segment key
    taken from the playlist JSON.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: current time in milliseconds plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the id alphabet using *seed*.

        Linear-congruential shuffle matching the site's player code; the
        exact constants and the in-loop mutation of `source` are part of
        the server contract, so statement order here must not change.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            # index depends on the *current* length of source, which shrinks
            # by one element each iteration
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index list into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested --format onto one of the site's stream types.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3208
3209
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of an xnxx video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page
        try:
            page_bytes = compat_urllib_request.urlopen(url).read()
            webpage = page_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        def _search(pattern, what):
            # Return the first capture of pattern, or None after reporting.
            found = re.search(pattern, webpage)
            if found is None:
                self._downloader.trouble(u'ERROR: unable to extract %s' % what)
                return None
            return found.group(1)

        flv_url = _search(self.VIDEO_URL_RE, u'video url')
        if flv_url is None:
            return
        video_url = compat_urllib_parse.unquote(flv_url)

        video_title = _search(self.VIDEO_TITLE_RE, u'video title')
        if video_title is None:
            return

        video_thumbnail = _search(self.VIDEO_THUMB_RE, u'video thumbnail')
        if video_thumbnail is None:
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3272
3273
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Abort here: without a match, mobj.group(1) below would raise
            # AttributeError instead of failing cleanly.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Abort here as well; indexing the empty list below would raise.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3397
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The media URL follows directly from the page path on the CDN
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first (HTML-unescaped) capture of rexp, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'upload_date' is the documented field name; the previous
            # 'uploader_date' key was a typo and was ignored downstream.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3433
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_list).

        On a download or API error the problem is reported and (0, [])
        is returned, so the caller's tuple unpacking and loop termination
        still work (the old bare `return` made the caller crash with a
        TypeError when unpacking None).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # error responses are JSON objects with an 'error' key
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paginated
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed) page means we've reached the end
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3520
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUGFIX: bail out here; the code previously fell through and
            # crashed with AttributeError on m.group() below.
            self._downloader.trouble(u'ERROR: unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            # BUGFIX: same fall-through crash as above when the title is missing.
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional; absence is not an error.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3557
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com video tweets."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            # BUGFIX: previously fell through and crashed on m.group(1).
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # Description is non-essential: warn and continue (previously
            # crashed with AttributeError on m.group(1)).
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip the embedded <a> tags before unescaping.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            # BUGFIX: bail out instead of crashing on m.group() below.
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            # BUGFIX: bail out instead of crashing on m.group() below.
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3606         
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return one info dict per trailer found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Matches the JS movie definitions embedded in the page; the
        # MOVIE_NAME part is optional in the page source.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always go through the /video/ page of the game, whatever URL form
        # was given.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Pair each movie definition with its on-page title, in document order.
        for vid,vtitle in zip(mweb,titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
                  }
            videos.append(info)
        return videos
3642
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The CDN URL can be derived directly from the recording id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"',webpage).group('title')
        uploader = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3664
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON assignment in a script tag.
        json_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream from the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3699
3700
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Pre-set the age-gate cookie so the full page is served.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal if missing)
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal if missing)
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        if result is None:
            # BUGFIX: message said ERROR but execution continues; align with
            # the non-fatal date path above.
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20....mp4
            # The 5th path element encodes "<size>_<bitrate>_<id>".
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: previously tested `result` (a leftover regex match that
            # is never None at this point), so an unavailable requested format
            # silently returned [None].
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3817
3818         
3819
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is taken straight from the URL path.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: the error message previously said "video title" on this
            # upload-date failure path.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3861
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page and pull the title out of it.
        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_m is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_m.group('title').strip()

        # The real stream lives behind an embed page; locate it.
        embed_m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_m.group(0).strip()
        video_id = embed_m.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via so.addVariable(...).
        source_m = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_m is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_m.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3907
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded as a JSON assignment to PAGE.mix.
        mix_m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_m.group(1))

        # A random session id identifies this play session to the API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_index = 0
        # Walk the play/next API chain until the server flags the last track.
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            track_index += 1
        return entries
3951
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the media file and its thumbnail are addressable by id on the CDN.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_m.group('title'))
        uploader_m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(uploader_m.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3975
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): [.\s]*? only matches runs of literal '.' and
        # whitespace — it looks like [\S\s]*? (match anything) was intended;
        # confirm against current page markup before changing.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        # Pair each talk entry with its title, in document order.
        for m_video, m_name in zip(m_videos,m_names):
            video_dic={
                       'id': m_video.group('video_id'),
                       'url': self._talk_video_link(m_video.group('mediaSlug')),
                       'ext': 'mp4',
                       'title': m_name.group('fullname')
                       }
            info.append(video_dic)
        return info
    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails script block carries the numeric id and media slug.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title
                }
        return info
4047
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (via the XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: was `format = ext` — `ext` is undefined here (NameError);
            # fall back to the file extension computed above.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4103
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        TweetReelIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        # GenericIE is the catch-all fallback and must stay last.
        GenericIE()
    ]
4155
4156