Allow downloading http://blip.tv/play/ embedded URLs
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) takes a URL and produces the metadata
    needed to download the video(s) it refers to: the real media URL, the
    title, the uploader and so on.  The result is a list of dictionaries
    which is handed to the FileDownloader, which may then download the
    video to the file system, among other possible outcomes.

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); _real_extract() must return a
    *list* of dictionaries as described above.  They should probably also
    be added to the list of extractors.  Broken extractors should set
    _WORKING to False so users are warned and the tests are skipped.
    """

    # Class-level defaults; instances shadow these where appropriate.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True if this IE can handle the given URL."""
        match = re.match(cls._VALID_URL, url)
        return match is not None

    @classmethod
    def working(cls):
        """Return whether this extractor is known to work."""
        return cls._WORKING

    def initialize(self):
        """Run one-time initialization (authentication, etc.), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader that receives this IE's output."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # By convention extractor classes are named FooIE; drop the suffix.
        class_name = type(self).__name__
        return class_name[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
131
132
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Accepts watch/embed/v URLs, youtu.be short links and bare video IDs
    (see _VALID_URL).  _real_initialize() sets the UI language and can log
    in and confirm age; _real_extract() scrapes the watch page and the
    get_video_info endpoint and returns one info dict per chosen format.
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Visiting this URL persists the English UI so later scraping sees predictable strings.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'  # machine entry consulted in ~/.netrc
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed falls back to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> 'heightxwidth' string; used for display in _print_formats and
    # in the 'format' field of the result dicts
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so let the playlist IE claim them first.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when the <text> tag has none
            start = float(start)
            end = start + float(dur)
            # SubRip timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning, srt) pair: on success warning is None and srt
        holds the .srt contents; on failure warning is a message and srt
        is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a language-code -> track-name map from the track listing XML.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: explicit option, then English, then any available track.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the UI language and, when credentials are available, log in
        and confirm age.  Failures are reported via the downloader and
        abort initialization without raising."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden form tokens the Google login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID embedded in *url* (group 2 of _VALID_URL),
        or report an error and return None when the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Scrape the watch page and get_video_info for *url*; return a
        list with one info dict per selected format (see InfoExtractor's
        class docstring for the dict layout)."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        # Try several 'el' variants; stop at the first response carrying a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # Date didn't match this format; try the next one.
                    # NOTE(review): the bare except also hides unrelated errors.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token (not used further in this method)
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig' field;
            # an entry without one would raise KeyError here — confirm.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected (itag, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
591
592
593 class MetacafeIE(InfoExtractor):
594     """Information Extractor for metacafe.com."""
595
596     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
597     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
598     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
599     IE_NAME = u'metacafe'
600
    def __init__(self, downloader=None):
        """Constructor. Delegates to the InfoExtractor base constructor."""
        InfoExtractor.__init__(self, downloader)
603
604     def report_disclaimer(self):
605         """Report disclaimer retrieval."""
606         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
607
608     def report_age_confirmation(self):
609         """Report attempt to confirm age."""
610         self._downloader.to_screen(u'[metacafe] Confirming age')
611
612     def report_download_webpage(self, video_id):
613         """Report webpage download."""
614         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
615
616     def report_extraction(self, video_id):
617         """Report information extraction."""
618         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
619
620     def _real_initialize(self):
621         # Retrieve disclaimer
622         request = compat_urllib_request.Request(self._DISCLAIMER)
623         try:
624             self.report_disclaimer()
625             disclaimer = compat_urllib_request.urlopen(request).read()
626         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
627             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
628             return
629
630         # Confirm age
631         disclaimer_form = {
632             'filters': '0',
633             'submit': "Continue - I'm over 18",
634             }
635         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
636         try:
637             self.report_age_confirmation()
638             disclaimer = compat_urllib_request.urlopen(request).read()
639         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
640             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
641             return
642
643     def _real_extract(self, url):
644         # Extract id and simplified title from URL
645         mobj = re.match(self._VALID_URL, url)
646         if mobj is None:
647             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
648             return
649
650         video_id = mobj.group(1)
651
652         # Check if video comes from YouTube
653         mobj2 = re.match(r'^yt-(.*)$', video_id)
654         if mobj2 is not None:
655             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
656             return
657
658         # Retrieve video webpage to extract further information
659         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
660         try:
661             self.report_download_webpage(video_id)
662             webpage = compat_urllib_request.urlopen(request).read()
663         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
664             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
665             return
666
667         # Extract URL, uploader and title from webpage
668         self.report_extraction(video_id)
669         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
670         if mobj is not None:
671             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
672             video_extension = mediaURL[-3:]
673
674             # Extract gdaKey if available
675             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
676             if mobj is None:
677                 video_url = mediaURL
678             else:
679                 gdaKey = mobj.group(1)
680                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
681         else:
682             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
683             if mobj is None:
684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
685                 return
686             vardict = compat_parse_qs(mobj.group(1))
687             if 'mediaData' not in vardict:
688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
689                 return
690             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
691             if mobj is None:
692                 self._downloader.trouble(u'ERROR: unable to extract media URL')
693                 return
694             mediaURL = mobj.group(1).replace('\\/', '/')
695             video_extension = mediaURL[-3:]
696             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
697
698         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
699         if mobj is None:
700             self._downloader.trouble(u'ERROR: unable to extract title')
701             return
702         video_title = mobj.group(1).decode('utf-8')
703
704         mobj = re.search(r'submitter=(.*?);', webpage)
705         if mobj is None:
706             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
707             return
708         video_uploader = mobj.group(1)
709
710         return [{
711             'id':       video_id.decode('utf-8'),
712             'url':      video_url.decode('utf-8'),
713             'uploader': video_uploader.decode('utf-8'),
714             'upload_date':  None,
715             'title':    video_title,
716             'ext':      video_extension.decode('utf-8'),
717         }]
718
719
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    Picks the best available quality from the player's flashvars and
    returns a single info dictionary per URL.
    """

    # Any country TLD (dailymotion.com, .fr, ...); group 1 is the id+slug.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # NOTE(review): presumably flags this extractor as currently broken so it
    # is skipped by default — confirm against the InfoExtractor base class.
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality stream URL and metadata for one video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The captured group may carry a "_title" slug and/or query string;
        # keep only the leading id portion.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information.
        # The family_filter=off cookie is sent so filtered videos load too.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Qualities are ordered best-first; the first key present wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            # for/else: no quality key found at all.
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # JSON-escaped slashes (\/) must be unescaped after URL-unquoting.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader is optional: try the owner span first, then the official
        # (rel="author") markup; failure is only a warning.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date appears as DD-MM-YYYY; convert to YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
807
808
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # The flv filename in the "current" query parameter doubles as the id.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Announce that the video webpage is being downloaded."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Announce that information extraction has started."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element info list for a photobucket flv URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        video_id = match.group(1)
        video_extension = 'flv'

        # Fetch the page that embeds the player.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The direct media URL sits in the video_src <link> element.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader both come from the page <title>.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
872
873
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs are first rewritten to the canonical English
    /watch/<vid>/<id> form (one recursive call), then the watch page and
    the playlist XML are scraped for the real media URL.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the media URL and metadata for one Yahoo! video.

        new_video is set to False on the internal recursive call after URL
        rewriting; it is not otherwise consulted in this method.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Fix: group(1) is the literal path word 'people'/'profile'; the
        # uploader name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1015
1016
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the player config JSON embedded in the watch page and picks
    the best available codec/quality combination.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the play_redirect URL and metadata for one Vimeo video."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize to an https canonical watch URL before fetching.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (text between " = {config:" and ",assets:")
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # Fix: was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; Exception still covers the IndexError/ValueError
        # raised by the split/json parsing above.
        except Exception:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (ISO date in a meta tag -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty bucket, best quality first.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1135
1136
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Scrapes videos.arte.tv "+7" pages by following a chain of
    intermediate URLs (player -> videoref XML -> per-language <video>
    entry) extracted with grep_webpage().
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in "index-<n>.html"; matched against the last
    # path component in _real_extract.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urllib raises ValueError for syntactically invalid URLs.
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map captured groups to keys.

        matchTuples is a list of (group_index, key, error_message)
        triples.  Returns a dict {key: group_value}, or None (after
        reporting the matching error) when the regex or a group fails.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage returned None, re.search below
        # raises rather than reporting cleanly — confirm intended.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp URL of a live stream (result currently unused)."""
        # Language code is the 4th-from-last path component of the live URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned, so live
        # streams produce no downloadable result (see _real_extract).
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the +7 page chain and return the info dict for the video."""
        # Language code is the 3rd-from-last path component (fr/de).
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final XML document carries id, title, date and the HD stream URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        # The last path component is used as the id for progress messages.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams return nothing (see extractLiveStream note).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1271
1272
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tried when no site-specific extractor handles a URL: follows
    URL-shortener redirects (via HEAD requests) and then scrapes the
    page for common embedded-player file= patterns.
    """

    # Matches anything; this extractor must be consulted last.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'
1278
1279     def __init__(self, downloader=None):
1280         InfoExtractor.__init__(self, downloader)
1281
1282     def report_download_webpage(self, video_id):
1283         """Report webpage download."""
1284         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1285         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1286
1287     def report_extraction(self, video_id):
1288         """Report information extraction."""
1289         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1290
1291     def report_following_redirect(self, new_url):
1292         """Report information extraction."""
1293         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1294
    def _test_redirect(self, url):
        """Follow HEAD redirects; if the URL resolves elsewhere, restart the
        download chain with the final URL.  Returns True when a redirect was
        followed (and re-dispatched), False when the URL is not a redirect."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET, so redirect
            # chains can be resolved without downloading bodies.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unescaped spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-open with a plain (GET) Request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener with the two custom handlers registered alongside
        # the standard HTTP/HTTPS machinery.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect took place.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        # Restart the whole extractor chain on the resolved URL.
        self._downloader.download([new_url])
        return True
1349
1350     def _real_extract(self, url):
1351         if self._test_redirect(url): return
1352
1353         video_id = url.split('/')[-1]
1354         request = compat_urllib_request.Request(url)
1355         try:
1356             self.report_download_webpage(video_id)
1357             webpage = compat_urllib_request.urlopen(request).read()
1358         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1359             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1360             return
1361         except ValueError as err:
1362             # since this is the last-resort InfoExtractor, if
1363             # this error is thrown, it'll be thrown here
1364             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1365             return
1366
1367         self.report_extraction(video_id)
1368         # Start with something easy: JW Player in SWFObject
1369         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1370         if mobj is None:
1371             # Broaden the search a little bit
1372             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1373         if mobj is None:
1374             # Broaden the search a little bit: JWPlayer JS loader
1375             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1376         if mobj is None:
1377             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1378             return
1379
1380         # It's possible that one of the regexes
1381         # matched, but returned an empty group:
1382         if mobj.group(1) is None:
1383             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1384             return
1385
1386         video_url = compat_urllib_parse.unquote(mobj.group(1))
1387         video_id = os.path.basename(video_url)
1388
1389         # here's a fun little line of code for you:
1390         video_extension = os.path.splitext(video_id)[1][1:]
1391         video_id = os.path.splitext(video_id)[0]
1392
1393         # it's tempting to parse this further, but you would
1394         # have to take into account all the variations like
1395         #   Video Title - Site Name
1396         #   Site Name | Video Title
1397         #   Video Title - Tagline | Site Name
1398         # and so on and so forth; it's just not practical
1399         mobj = re.search(r'<title>(.*)</title>', webpage)
1400         if mobj is None:
1401             self._downloader.trouble(u'ERROR: unable to extract title')
1402             return
1403         video_title = mobj.group(1)
1404
1405         # video uploader is domain name
1406         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1407         if mobj is None:
1408             self._downloader.trouble(u'ERROR: unable to extract title')
1409             return
1410         video_uploader = mobj.group(1)
1411
1412         return [{
1413             'id':       video_id,
1414             'url':      video_url,
1415             'uploader': video_uploader,
1416             'upload_date':  None,
1417             'title':    video_title,
1418             'ext':      video_extension,
1419         }]
1420
1421
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearch[N|all]:...)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch the actual search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # FIX: split on the FIRST colon only; a plain split(':') raises
        # ValueError whenever the search terms themselves contain a colon
        # (e.g. "ytsearch5:review: some product").
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Page through the GData API, 50 results at a time (1-based index).
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never page past the total number of hits the API reports.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1500
1501
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearch[N|all]:...)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch the actual search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # FIX: split on the FIRST colon only; a plain split(':') raises
        # ValueError whenever the search terms themselves contain a colon.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated, stopping at n.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: download whatever we collected.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1582
1583
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearch[N|all]:...)."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch the actual search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # FIX: split on the FIRST colon only; a plain split(':') raises
        # ValueError whenever the search terms themselves contain a colon.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated, stopping at n.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" link: download whatever we collected.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1668
1669
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex (requires re.VERBOSE at match time, see suitable()).
    # Matches playlist/course/user URLs (id captured in group 1) or a bare
    # PL/EC/UU playlist id (captured in group 2).
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    # Page size used when walking the GData feed.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be compiled with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the GData playlist feed and queue every video URL."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []  # accumulates (playlist position, video URL) pairs

        while True:
            self.report_download_page(playlist_id, page_num)

            # GData start-index is 1-based.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Entries without a 'content' key are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means it was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Honor --playlist-start / --playlist-end (1-based; -1 means no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1762
1763
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Log that a channel listing page is about to be fetched."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk a channel's paginated video listing and queue every video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)

        collected_ids = []
        page_index = 1

        # Keep fetching listing pages while a "Next »" link is present.
        while True:
            self.report_download_page(channel_id, page_index)
            page_url = self._TEMPLATE_URL % (channel_id, page_index)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect video ids from this page, de-duplicated within the page.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                video_id = match.group(1)
                if video_id not in page_ids:
                    page_ids.append(video_id)
            collected_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_index = page_index + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected_ids)))

        for video_id in collected_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1814
1815
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum number of results GData returns per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids for a user via the GData API and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based; -1 means no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1897
1898
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Page size of the mobile episode-list AJAX endpoint.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """List all videos of a blip.tv user and queue each for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # FIX: check the search result before dereferencing it; previously
        # a page without data-users-id crashed with an uncaught
        # AttributeError on mobj.group(1).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # FIX: use compat_str (not plain str) for consistency with
                # the rest of the file; str(err) can fail on py2 unicode.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based; -1 means no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1988
1989
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles.com link to the direct file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: raw string for the regex — '\s' in a plain string
                # literal is an invalid escape sequence.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2048
2049
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint POSTed to in _real_initialize.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in the user's .netrc for stored credentials.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'
2057
    def report_login(self):
        """Report attempt to log in (status message only; no side effects)."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2061
    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from --username/--password or, with --netrc, from
        the 'facebook' entry in ~/.netrc. Without credentials this is a
        silent no-op; login failures are reported as warnings, not errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; continue without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the login form is echoed back, the login did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2105
2106     def _real_extract(self, url):
2107         mobj = re.match(self._VALID_URL, url)
2108         if mobj is None:
2109             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2110             return
2111         video_id = mobj.group('ID')
2112
2113         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2114         webpage = self._download_webpage(url, video_id)
2115
2116         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2117         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2118         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2119         if not m:
2120             raise ExtractorError(u'Cannot parse data')
2121         data = dict(json.loads(m.group(1)))
2122         params_raw = compat_urllib_parse.unquote(data['params'])
2123         params = json.loads(params_raw)
2124         video_url = params['hd_src']
2125         if not video_url:
2126             video_url = params['sd_src']
2127         if not video_url:
2128             raise ExtractorError(u'Cannot find video URL')
2129         video_duration = int(params['video_duration'])
2130
2131         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2132         if not m:
2133             raise ExtractorError(u'Cannot find title in webpage')
2134         video_title = unescapeHTML(m.group(1))
2135
2136         info = {
2137             'id': video_id,
2138             'title': video_title,
2139             'url': video_url,
2140             'ext': 'mp4',
2141             'duration': video_duration,
2142             'thumbnail': params['thumbnail_src'],
2143         }
2144         return [info]
2145
2146
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that a direct (non-JSON) download was detected."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL, including /play/ embeds.

        Returns a single-element list with the info dictionary, or None
        after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # http://blip.tv/play/ URLs are embed players: they redirect to a URL
        # whose fragment carries a query string with a 'file' parameter that
        # points at the RSS feed. Recover the numeric item id from it and
        # rewrite the URL to the canonical http://blip.tv/a/a-<id> form.
        if '/play/' in url:
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            hashindex = redirecturl.find('#')
            if hashindex != -1:
                # Locals renamed from 'hash'/'file' to avoid shadowing builtins
                fragment = redirecturl[hashindex+1:]
                params = compat_parse_qs(fragment)
                file_entries = params['file']
                for file_entry in file_entries:
                    # Raw string: '\d' in a plain literal is an invalid escape
                    match = re.search(r'/(\d+)', file_entry)
                    if match:
                        file_id = match.group(1)
                        url = 'http://blip.tv/a/a-' + file_id

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Ask for the JSON description of the item; blip.tv serves it to the
        # iTunes user agent.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or bare
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2251
2252
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video info from a myvideo.de watch URL.

        Returns a single-element list with the info dictionary, or None
        after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this previously called self._download.trouble — a
            # nonexistent attribute — so an invalid URL raised AttributeError
            # instead of being reported.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The image_src link carries the media server base URL; the .flv
        # itself lives next to the thumbs directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2301
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest quality last in _real_extract's default pick
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (all mp4)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate, used only by --list-formats output
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Print available bitrates with extension and resolution (--list-formats)
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one or more video parts from a Daily Show / Colbert URL.

        Returns a list of info dictionaries, one per episode part, or None
        after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert abbreviations to the full-episodes page
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode title; dlNewest means "follow the redirect to
        # the newest full episode".
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the latest episode; re-match
            # the final URL to recover its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the MRSS index listing every part of the episode
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like mgid:...:<show>.com:<media-id>
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-part configuration XML lists the available renditions
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain HTTP one on the llnwd CDN
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2496
2497
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video info dict from an escapistmagazine.com URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            pageHandle = compat_urllib_request.urlopen(url)
            pageBytes = pageHandle.read()
            charsetMatch = re.match(r'text/html; charset="?([^"]+)"?', pageHandle.headers['Content-Type'])
            webPage = pageBytes.decode(charsetMatch.group(1) if charsetMatch else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull a single <meta> value out of the page and unescape it
        def _page_meta(pattern):
            return unescapeHTML(re.search(pattern, webPage).group(1))

        description = _page_meta('<meta name="description" content="([^"]*)"')
        imgUrl = _page_meta('<meta property="og:image" content="([^"]*)"')
        playerUrl = _page_meta('<meta property="og:video" content="([^"]*)"')
        # The player URL embeds the (quoted) configuration URL after 'config='
        configUrl = compat_urllib_parse.unquote(re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            configHandle = compat_urllib_request.urlopen(configUrl)
            charsetMatch = re.match(r'text/html; charset="?([^"]+)"?', configHandle.headers['Content-Type'])
            configJSON = configHandle.read().decode(charsetMatch.group(1) if charsetMatch else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        videoUrl = config['playlist'][1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
2571
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video info dict from a collegehumor.com video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {'id': video_id, 'uploader': None, 'upload_date': None}

        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # Metadata XML carries title/description/thumbnail and the manifest URL
        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest yields the media node and the real video id
        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        parsed = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = parsed.scheme + '://' + parsed.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2642
2643
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video info dict from an xvideos.com video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL (URL-quoted inside the flash vars)
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title (taken from the page <title>)
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail (full matched URL, not just the filename group)
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2701
2702
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the track info dict from a soundcloud.com permalink."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Uploader and the slug of the song title both live in the URL path
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the permalink into the API's JSON track representation
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN for the available streams and pick the 128k mp3
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2775
2776
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com presentation videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real stream id is base64-encoded in the page
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split only on the LAST dot: filenames may contain extra dots,
        # in which case a plain split('.') would raise ValueError here.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2830
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass # dead link, try the next one

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate combinations to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: regex groups are already text; the old .decode('utf-8') calls
        # crashed on Python 3, where str has no decode method.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON; the HTTP payload is bytes and must be decoded first
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2945
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            # ElementTree accepts bytes input directly, no decode needed here
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Decode the raw bytes before regex-matching below: on
                # Python 3 a str pattern cannot be applied to a bytes page.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3057
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns text (unicode), so the matched groups
        # need no further decoding; the old .decode('iso-8859-1') calls
        # crashed on Python 3, where str has no decode method.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble(u'ERROR: Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3137
3138
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves videos in numbered segments whose file ids are obfuscated
    with a seeded shuffle of a fixed alphabet; this extractor re-derives
    the permutation client-side to reconstruct the real segment ids.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet permuted by Youku's seeded PRNG.

        The server derives the same permutation from the same seed, so the
        constants below (211, 30031, 65536) are part of the protocol and
        must not be changed.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step; each iteration picks (and removes)
            # one character of 'source', producing a full permutation.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and build one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user-requested format to Youku's stream names.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            # One obfuscated fileid plus one access key per segment.
            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3248
3249
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce that the page download is starting."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that metadata extraction is starting."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract flv stream URL, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page ourselves (no download helper used here)
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3312
3313
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # bugfix: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # bugfix: previously continued and crashed indexing an empty list
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3437
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first unescaped capture of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was 'uploader_date', a typo nothing ever read;
            # the documented optional field is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3473
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the Justin.tv API.

        Returns a tuple (raw item count of the API response, list of info
        dicts for the clips that have a video URL).  On download or API
        errors the problem is reported via the downloader and an empty
        page (0, []) is returned so the caller's tuple unpacking is safe.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # Previously this returned None, which made the caller's
            # `page_count, page_info = ...` unpacking raise TypeError.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # On errors the API answers with a JSON object, not a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-like; keep only YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Only a channel name was given: page through all its archives.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A page shorter than the limit means we reached the last one.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3560
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Bail out: without a match the m.group() call below would raise
            # AttributeError on None instead of reporting a clean error.
            self._downloader.trouble(u'ERROR: unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional; fall back to None when the tag is absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3597
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            # The status ID is required to build the video URL; stop here
            # rather than crashing on m.group() below.
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # Description is non-essential: warn and carry on with None
            # instead of raising AttributeError on the failed match.
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip embedded links before unescaping.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The tweet text doubles as the title.
        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3646
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains inline comments, so re.VERBOSE is mandatory.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract every movie listed on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # URL, title and thumbnail matches are assumed to appear in the
        # same order on the page, so iterate them in lockstep.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Report and skip the broken entry instead of emitting an
                # info dict with an empty 'url' field.
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                continue
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            })
        return videos
3687
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN video URL from the id and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3709
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract the audio stream from the JSON blob embedded in the page."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        json_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if json_m is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Ask the CDN for the 256 kbps stream.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3744
3745
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict whose 'format' equals req_format,
        or None when no such format exists."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is a cookie check.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: this used to test the stale `result` match object from
            # the regex searches above, so an unavailable format was never
            # reported and [None] was returned.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3862
3863
3864
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the FLV URL and upload date from the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flash player setup carries the direct FLV URL.
        url_match = re.search(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group('url'))

        # Upload date, as shown in the "Added ... by" byline.
        date_match = re.search(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if date_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        upload_date = date_match.group('date')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
3906
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Resolve the embed page and pull the FLV URL from its player setup."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The player passes the file via encodeURIComponent in its setup JS.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3952
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play API, collecting one info dict per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The play API requires a (random) session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3996
class KeekIE(InfoExtractor):
    """Information extractor for keek.com videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the CDN URLs from the id and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4020
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains inline comments, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: single talk vs. playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Match each playlist entry's id and media slug on the playlist page.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        # Entry and title matches are assumed to appear in the same order,
        # so they are consumed in lockstep; each talk page is then fetched
        # individually for its full metadata.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the canonical id and media slug.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4093
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: this fallback used the undefined name `ext`, raising
            # NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional in the metadata XML.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4149
def gen_extractors():
    """Return a list containing an instance of every supported extractor.

    The order does matter: the first extractor that matches a URL is the
    one that handles it, so more specific extractors come first.
    """
    extractors = [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        TweetReelIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        GenericIE(),
    ]
    return extractors
4201
4202