IEs: clean __init__ methods
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready tracks whether _real_initialize() has already run, so that
    # per-instance setup (authentication, etc.) happens at most once.
    _ready = False
    # The FileDownloader this extractor reports to; set via set_downloader().
    _downloader = None
    # Subclasses set this to False when the extractor is known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Name of the extractor, derived by stripping the 'IE' suffix from
        the class name. Subclasses may shadow this with a class attribute."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle.

        note=None prints the default "Downloading webpage" message,
        note=False suppresses output entirely.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Attach the original traceback to the ExtractorError
            # (requires `import sys` at module level).
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Use the charset declared in the Content-Type header, falling
        # back to UTF-8 when none is given.
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps binary-ish payloads screen-safe
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids crashing on pages with a wrong declared charset
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608.
    # They set the correct value of the '_type' key so the FileDownloader
    # knows how to process the result.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string shown by --list-formats
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; let the playlist IE
        # claim them first.
        if YoutubePlaylistIE.suitable(url):
            return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name, or a tuple
        (error_message, None) if the subtitle list could not be fetched."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages (--list-subs)."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit --sub-lang, then English, then any.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print formats as 'itag : ext [dimensions]' (--list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the preferred language, log in and confirm age.

        Everything here is best-effort: failures emit a warning/error via
        the downloader and abort initialization without raising.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form tokens that must be echoed back with the credentials
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        # POST data must be bytes on Python 3, same as the login request above
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from a YouTube URL (or naked ID), or None
        after reporting an error when the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one of them yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # Date did not match this format; try the next one
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # 'sig' is optional; only append the signature when present to
            # avoid a KeyError on unsigned streams.
            url_map = dict((ud['itag'][0],
                            ud['url'][0] + ('&signature=' + ud['sig'][0] if 'sig' in ud else ''))
                           for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
681
682
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Metacafe gates some content behind a family filter, so
    _real_initialize() first retrieves the disclaimer page and then POSTs
    an age-confirmation form to disable the filter for the session.
    """

    # Watch-page URL: group(1) is the video id, group(2) the slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age to lift the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            # Response body is unused; the request is made for its cookies.
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract id, media URL, title and uploader from a Metacafe page.

        Returns a single-element list of info dicts, or a url_result
        delegating to the YouTube extractor for 'yt-' prefixed ids.

        NOTE(review): the .decode('utf-8') calls below rely on Python 2
        str semantics — TODO confirm before running under Python 3.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Metacafe mirrors YouTube videos under 'yt-<id>'; delegate.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: media URL embedded directly in the markup.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: metadata is inside the flashvars param.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # JSON-escaped slashes need unescaping.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
786
787
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def _real_extract(self, url):
        """Extract id, best-quality media URL, title, uploader and date.

        Returns a single-element list of info dicts, or None after
        reporting an error.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id component may carry a slug ('_...') or query ('?...').
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Request the page with the family filter disabled so age-gated
        # videos are still served.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Probe quality keys from best to worst; keep the first present.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # Unescape JSON-style slashes in the media URL.
        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: try the regular owner markup first, then the
        # "official user" markup; a missing uploader is only a warning.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Page shows DD-MM-YYYY; info dict wants YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
868
869
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract id, media URL, title and uploader from a Photobucket page.

        Returns a single-element list of info dicts, or None after
        reporting an error.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Fetch the page the URL points at; all metadata is inline.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL lives in the video_src <link> tag.
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader come from a single <title> pattern.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
922
923
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video URL.

        URLs that are not in the canonical '/watch/' form are first
        rewritten (via ids scraped from the page) and re-extracted once;
        new_video=False marks that second pass. Returns a single-element
        list of info dicts, or None after reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group(1) is the 'people'/'profile' path component of the
        # href; the uploader name is captured by the second group.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1054
1055
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo URL.

        Parses the page's embedded config JSON for metadata, then picks
        the best available quality/codec pair. Returns a single-element
        list of info dicts, or None after reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Canonicalize: force https and undo player redirect links.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # BUGFIX: was a bare 'except:', which also swallowed
            # KeyboardInterrupt/SystemExit; only real errors from the
            # split/json.loads above should be handled here.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (page shows ISO 8601; info dict wants YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best populated quality bucket (hd > sd > other).
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1163
1164
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles two page flavours: live-stream index pages (matched by
    _LIVE_URL) and regular "Plus 7" catch-up pages. Extraction works by
    chaining grep_webpage() calls that each fetch a page and pull URLs
    out of it with a regex.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in 'index-<n>.html'.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its body.

        Returns None (implicitly) after reporting an error on network
        failure or an invalid URL.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and extract named groups via *regex*.

        matchTuples is a list of (group_index, key, error_message); the
        result dict maps each key to its matched group. Returns None
        after reporting an error if the regex or any group fails.

        NOTE(review): if fetch_webpage() failed, *page* is None and
        re.search raises TypeError here — confirm callers tolerate that.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the RTMP URL for a live-stream page.

        NOTE(review): the final video_url is computed but never
        returned, so _real_extract yields nothing for live streams —
        looks unfinished; confirm intended behavior.
        """
        video_lang = url.split('/')[-4]
        # Step 1: find the videothek JS file referenced by the page.
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Step 2: pull stream path, SWF player and RTMP base from the JS.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve metadata for a "Plus 7" catch-up page.

        Follows two levels of indirection (videoref file, then the
        per-language <video> XML) and returns an info dict.
        """
        video_lang = url.split('/')[-3]
        # Step 1: the player param points at a videoref file URL.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the <video> entry for the page's language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: extract id, title, date and the HD media URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # Python 2 str semantics assumed here — TODO confirm.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus 7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns nothing, so this
            # path produces no results — confirm intended.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1288
1289
1290 class GenericIE(InfoExtractor):
1291     """Generic last-resort information extractor."""
1292
1293     _VALID_URL = r'.*'
1294     IE_NAME = u'generic'
1295
    def report_download_webpage(self, video_id):
        """Report webpage download.

        Warns that the last-resort generic extractor is being used
        (suppressed in test mode), then delegates to the base class.
        """
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)
1301
    def report_following_redirect(self, new_url):
        """Report that extraction is continuing at the redirect target *new_url*."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1305
    def _test_redirect(self, url):
        """Detect URL-shortener style redirects.

        Issues a HEAD request for *url* (falling back to GET on 405) and
        follows redirects. Returns the final URL if it differs from the
        input, otherwise False.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that performs HEAD instead of GET, so we
            # never download the body just to learn the final URL.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the 405 response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means there was no redirect.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1359
1360     def _real_extract(self, url):
1361         new_url = self._test_redirect(url)
1362         if new_url: return [self.url_result(new_url)]
1363
1364         video_id = url.split('/')[-1]
1365         try:
1366             webpage = self._download_webpage(url, video_id)
1367         except ValueError as err:
1368             # since this is the last-resort InfoExtractor, if
1369             # this error is thrown, it'll be thrown here
1370             self._downloader.report_error(u'Invalid URL: %s' % url)
1371             return
1372
1373         self.report_extraction(video_id)
1374         # Start with something easy: JW Player in SWFObject
1375         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1376         if mobj is None:
1377             # Broaden the search a little bit
1378             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1379         if mobj is None:
1380             # Broaden the search a little bit: JWPlayer JS loader
1381             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1382         if mobj is None:
1383             self._downloader.report_error(u'Invalid URL: %s' % url)
1384             return
1385
1386         # It's possible that one of the regexes
1387         # matched, but returned an empty group:
1388         if mobj.group(1) is None:
1389             self._downloader.report_error(u'Invalid URL: %s' % url)
1390             return
1391
1392         video_url = compat_urllib_parse.unquote(mobj.group(1))
1393         video_id = os.path.basename(video_url)
1394
1395         # here's a fun little line of code for you:
1396         video_extension = os.path.splitext(video_id)[1][1:]
1397         video_id = os.path.splitext(video_id)[0]
1398
1399         # it's tempting to parse this further, but you would
1400         # have to take into account all the variations like
1401         #   Video Title - Site Name
1402         #   Site Name | Video Title
1403         #   Video Title - Tagline | Site Name
1404         # and so on and so forth; it's just not practical
1405         mobj = re.search(r'<title>(.*)</title>', webpage)
1406         if mobj is None:
1407             self._downloader.report_error(u'unable to extract title')
1408             return
1409         video_title = mobj.group(1)
1410
1411         # video uploader is domain name
1412         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1413         if mobj is None:
1414             self._downloader.report_error(u'unable to extract title')
1415             return
1416         video_uploader = mobj.group(1)
1417
1418         return [{
1419             'id':       video_id,
1420             'url':      video_url,
1421             'uploader': video_uploader,
1422             'upload_date':  None,
1423             'title':    video_title,
1424             'ext':      video_extension,
1425         }]
1426
1427
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts 'ytsearch:TERM', 'ytsearchN:TERM' (first N results) and
    'ytsearchall:TERM' pseudo-URLs and resolves them through the GData
    JSON API.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # BUG FIX: split only on the first ':' so that search terms
        # containing a colon (e.g. "ytsearch:foo: bar") are not mangled.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the results were previously discarded (no return),
            # so 'ytsearchall:' queries silently produced nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Each API page holds up to 50 items; 'limit' is tightened to the
        # server-reported totalItems so we stop when the feed runs out.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1498
1499
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts 'gvsearch:TERM', 'gvsearchN:TERM' and 'gvsearchall:TERM'
    pseudo-URLs; results are handed straight to the downloader.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # BUG FIX: split only on the first ':' so search terms containing
        # a colon survive intact.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated, first-seen order)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1577
1578
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts 'yvsearch:TERM', 'yvsearchN:TERM' and 'yvsearchall:TERM'
    pseudo-URLs; results are handed straight to the downloader.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # BUG FIX: split only on the first ':' so search terms containing
        # a colon survive intact.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated via already_seen)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1660
1661
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through the GData playlist feed and returns the video URLs
    ordered by their playlist position.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # BUG FIX: initialize here so an empty playlist (first page has no
        # 'entry') does not leave the name unbound at the return below.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            # Keep (position, url) pairs so the final list can be sorted
            # into playlist order; entries without 'content' are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1741
1742
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from *page*, in first-seen order."""
        found = []
        seen = set()
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in seen:
                seen.add(candidate)
                found.append(candidate)
        return found

    def _real_extract(self, url):
        """Collect every video id of a channel and wrap them as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # The first page is the regular HTML channel listing.
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids.extend(self.extract_videos_from_page(page))

        # Subsequent pages come from the JSON-based channel_ajax endpoint,
        # fetched for as long as a "load more" widget is advertised.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                payload = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(payload['content_html']))

                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1815
1816
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch all upload ids of a user through the GData API, window by
        window, and return them wrapped as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData feed serves at most _GDATA_PAGE_SIZE entries per
        # request, so keep asking for the next window until a short
        # (non-full) page signals the end of the upload list.
        video_ids = []
        page_idx = 0

        while True:
            first_index = page_idx * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, first_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's ids, keeping first-seen order and
            # dropping duplicates within the page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))

            video_ids.extend(page_ids)

            # A page that is not full must be the last one; no need to
            # query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_idx += 1

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1884
1885
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a user page to its numeric users_id, then pages through the
    mobile Ajax episode list to collect every video URL.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # BUG FIX: a page without a data-users-id attribute used to raise
        # an uncaught AttributeError (None.group); report an error instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from webpage: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Consistency fix: use compat_str like the other handlers.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1961
1962
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Simulate pressing 'Free download' and extract the direct file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # BUG FIX: raw string so '\s' is a regex class, not an
                # invalid string escape (DeprecationWarning on modern Python).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): the .decode() calls below assume bytes input
        # (Python 2 str); under Python 3 they would fail — confirm before
        # porting this extractor.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2013
2014
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Numeric video id is captured in the named group 'ID'.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in ~/.netrc when --netrc is used.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in to Facebook before extraction.

        Credentials come from the downloader's 'username'/'password'
        params or, as a fallback, from the 'facebook' .netrc entry; if
        neither is available, extraction proceeds anonymously.  Failures
        only produce warnings, never abort.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; continue without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form re-appearing in the response means the login
            # was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail for a
        Facebook video page.  Raises ExtractorError when the page cannot
        be parsed."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player data is a JSON blob sitting between these two exact
        # javascript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON holding the actual stream descriptors.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source and fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2112
2113
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Matches any path under blip.tv (with or without scheme/subdomain).
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension out of the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        /play/ URLs are first resolved to the canonical
        http://blip.tv/a/a-<id> form (by following the redirect and
        reading the file id from the URL fragment) and then re-extracted.
        Otherwise the site's JSON API (skin=json) is queried — unless the
        server already serves the media file directly.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Resolve /play/ URLs to the canonical form and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            # The file reference lives in the redirect URL's fragment.
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Append the JSON-API query parameters to the page URL.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The iTunes user agent is required for the API to respond.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode only exists on Python 2 byte
                # strings — verify this branch under Python 3.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader so the
                    # file is not requested a second time.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # NOTE(review): '%H:%M%p' mixes 24-hour %H with %p —
                # confirm this matches the feed's datestamp format.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    # Downloader must present the same UA as the API request.
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2210
2211
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page.

        Returns a one-element list with the info dict, or None after
        reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was `self._download.report_error`, which raised
            # AttributeError — the attribute is named `_downloader`.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media server path; the FLV sits
        # next to it, named after the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2253
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate identifiers (kbit/s).
    # NOTE(review): not referenced by the methods below — possibly dead.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container per bitrate identifier (everything is mp4).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate identifier, for --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden so matching is done with re.VERBOSE, which the
        # multi-line _VALID_URL pattern above relies on.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        # Progress message for the per-clip mediagen config request.
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        # Progress message for the episode's mRSS index request.
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the given format ids with extension and resolution."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part of the episode/clip at url."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand :tds-style abbreviations to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode title; dlNewest means "no specific episode
        # given, follow the site's redirect to the newest one".
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The bare full-episodes URL redirects to the newest episode;
            # re-parse the final URL to recover its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Locate the mtvnservices media URI embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The mRSS index lists the episode's parts (one <item> each).
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-part mediagen config carries the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (bitrate, rtmp-url) pairs, in feed order.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into an equivalent HTTP URL on the
            # llnwd CDN, keeping the media path after gsp.comedystor/.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                # NOTE(review): this is the raw mRSS pubDate text, not the
                # YYYYMMDD form documented for upload_date — verify.
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2445
2446
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Report that the player configuration is being downloaded."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract an Escapist video: scrape the page's OpenGraph tags,
        follow the embedded player's config URL, and read the stream URL
        from the (almost-JSON) player configuration."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            page_handle = compat_urllib_request.urlopen(url)
            page_bytes = page_handle.read()
            # Honour the charset announced in the Content-Type header.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', page_handle.headers['Content-Type'])
            page = page_bytes.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # OpenGraph / meta tags carry description, thumbnail and player.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL embeds a percent-encoded config URL.
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # The config is JavaScript with single quotes, not strict JSON.
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual video stream.
        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2517
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): _WORKING = False marks this extractor as broken —
    # confirm the downloader skips it by default.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract a collegehumor video.

        Fetches the moogaloop metadata XML for title/description and the
        manifest URL, then the f4m manifest to build the fragment URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Partial info dict; filled in as the metadata is discovered.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest is Adobe f4m; elements live in its XML namespace.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # NOTE(review): the built URL ends in 'Seg1-Frag1', i.e. the first
        # fragment only — verify this yields the whole video.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2584
2585
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the flash video URL, title and thumbnail from an
        xvideos.com watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The FLV location is URL-encoded in the page's player parameters.
        m = re.search(r'flv_url=(.+?)&', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # The page <title> carries the video name before an "- XVID" suffix.
        m = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        # Thumbnail: the whole matched URL (group 0) is the image address.
        m = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = m.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2639
2640
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    The page URL is first resolved through the public resolve.json API
    to obtain the track id, then the streams endpoint is queried for the
    actual MP3 stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The uploader name and the track's title slug are both in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical page URL to the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions for this track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2706
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets (playlists).

    The set URL is resolved through the public resolve.json API; then
    every track in the set is queried on the streams endpoint for its
    MP3 stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): same IE_NAME as SoundcloudIE — confirm this is intended.
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The uploader name and the set's title slug are both in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical set URL to the set's API record.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports resolution failures as a list of error objects.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        videos = []
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction(full_title)

            # Fetch the stream definitions for this track id.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2780
2781
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the rtmpe video URL, title and description from an
        InfoQ presentation page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded, URL-quoted
        # rtmpe path in the 'jsclassref' JavaScript variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the *last* dot only: filenames may contain additional
        # dots, which would make a plain split('.') raise ValueError
        # during the 2-tuple unpacking.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2831
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): file_url is assigned here but never used.
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            # 'best' / unknown bitrates fall back to the highest available.
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            # jsonData[fmt] is already the flat URL list in that case.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Dead link: fall through and try the next candidate.
                url = None

        # No URL in the list was reachable.
        return None

    def _print_formats(self, formats):
        # Print every format/bitrate combination (used for --list-formats).
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        # NOTE(review): the .decode('utf-8') calls throughout this method
        # assume Python 2 byte strings; on Python 3, re groups are str and
        # have no .decode(). The IE is disabled (_WORKING = False), so the
        # code is documented as-is rather than rewritten.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No specific format requested: probe each format until one of
            # its URLs responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2939
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL shape: a specific video, a single course
        page, or the site root (all courses). Course and root pages are
        treated as playlists of 'reference' entries that are resolved by
        recursively calling self.extract() on each linked page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the video URL.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id if no <h1> is found.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link on the course page becomes a playlist entry.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference is extracted as a specific video.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every CoursePage link on the home page becomes a playlist entry.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference is extracted as a course page.
                results += self.extract(entry['url'])
            return results
3043
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract song/performer metadata from the page's <meta> tags,
        then fetch the mediaGen XML and pick the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already yields a decoded text page; the former
        # .decode('iso-8859-1') calls on the match groups crashed on
        # Python 3 (str has no decode) and on non-ASCII data in Python 2.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality (last rendition).
        rendition = renditions[-1]

        try:
            # The MIME type ('video/mp4' etc.) yields the extension.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3119
3120
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id from the current time in ms."""
        now_ms = int(time.time() * 1000)
        part_a = random.randint(1000, 1998)
        part_b = random.randint(1000, 9999)
        return "%d%d%d" % (now_ms, part_a, part_b)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle Youku's id alphabet with a linear-congruential generator
        keyed on *seed*; returns the shuffled alphabet as a char list."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        state = float(seed)
        shuffled = []
        while pool:
            # LCG step, then pick an index scaled to the shrinking pool.
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(pool)))
            shuffled.append(pool.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated file id: each '*'-separated token is an
        index into the seed-shuffled alphabet."""
        alphabet = self._get_file_ID_mix_string(seed)
        return ''.join(alphabet[int(token)] for token in fileId.split('*') if token)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))
            video_data = config['data'][0]

            video_title = video_data['title']
            seed = video_data['seed']

            requested_format = self._downloader.params.get('format', None)
            supported_format = list(video_data['streamfileids'].keys())

            # Map the user's format request onto Youku's stream names.
            if requested_format is None or requested_format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif requested_format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = video_data['streamfileids'][format]
            keys = [seg['k'] for seg in video_data['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the decoded file id carry the segment number,
        # so each segment gets its own id with that field patched in.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3222
3223
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of an xnxx.com page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_download_webpage(video_id)

        # Fetch and decode the page directly.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3278
3279
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract a video from a Google+ post.

        Two steps: fetch the post page for metadata (date, uploader,
        title), then fetch the photos/video page it references for the
        actual video URLs.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # Must bail out here: mobj.group(1) below would raise
            # AttributeError on None (previously there was no return).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # Must bail out here as well: links[-1] below would raise
            # IndexError on an empty list (previously there was no return).
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the highest resolution (last entry of the ascending sort)
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3400
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com / watch.nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First regex group from the page, HTML-unescaped, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was the typo 'uploader_date', which no consumer
            # reads; 'upload_date' is the documented metadata field.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3436
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page.

        Returns (raw item count, list of *valid* info dicts), or None
        after reporting an error.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a dict instead of a clip list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Channel URLs (group 1 only) are paged archives; /b/ URLs are a
        # single broadcast.
        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            # _parse_page returns None on a download/API error; previously
            # that None was unpacked directly, raising TypeError. Return
            # whatever was collected so far instead.
            page_result = self._parse_page(page_url)
            if page_result is None:
                return info
            page_count, page_info = page_result
            info.extend(page_info)
            # A short page means we've reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3519
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a single-entry list with url, title and description."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Bug fix: report_error does not raise, so execution previously
            # fell through to m.group(...) on None and died with AttributeError.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> when the player heading is absent.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # Bug fix: same report-then-crash pattern as above.
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3558
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default matcher
        # (which omits that flag) cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')
        # The age-gate query parameters let us reach the video page directly.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % game_id
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, game_id)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        # Walk the three regex streams in lockstep: player config, display
        # names, and thumbnails appear in the same order on the page.
        videos = []
        for vid, vtitle, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
3602
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Recorded videos are served as flat FLV files from the CDN.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3624
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com videos."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the direct CDN media URL, title and thumbnail for a video page."""
        # Direct media link on the hw-videos CDN (mp4 or flv).
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Derive the container extension from the matched URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fall back to a timestamped placeholder title.
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            # Candy pages carry no image_src; their real title lives in a
            # "candytitles" span, so re-extract it here (overriding the
            # <title>-based value) and leave the thumbnail unset.
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3680
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream; the extension comes from the URL path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3715
3716
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Extract all downloadable formats and return the one(s) requested."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Bypass the age gate with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; warn only)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional; warn only)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: this previously tested the stale regex match `result`
            # (never None at this point), so an unavailable requested format
            # was silently returned as [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3833
3834
3835
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is taken straight from the URL path.
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The FLV URL is embedded in the player setup javascript.
        match = re.search(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group('url'))

        # NOTE: the historical error message says 'title' although this
        # step extracts the upload date.
        match = re.search(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        upload_date = match.group('date')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
3877
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The title lives on the public page; the stream URL on the embed page.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        # The numeric id from the embed link supersedes the slug from the URL.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3923
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # Playing a mix requires a random session token; tracks are then
        # fetched one at a time through the play/next API endpoints.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        track_index = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track explicitly.
            if api_data['set']['at_last_track']:
                break
            track_index += 1
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3967
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs follow a fixed CDN layout.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3991
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses re.VERBOSE, so the base class matcher cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk entry carries its numeric id and media slug as data attributes.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Delegate each talk back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # Talk metadata is embedded as javascript; pull out the id and media slug.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4069
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this branch referenced the undefined name `ext`
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Optional fields: description and preview image may be absent.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4125
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Per-video XML manifest listing the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last entry in the manifest is the variant we download.
        best = idoc[-1]
        filename = best.findall('./filename')[0].text
        duration = float(best.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4158
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Bug fix: report_error does not raise, so execution previously
            # fell through to m.group('title') on None (AttributeError).
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1)) if m else None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4207
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de / mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page heading used as the video title.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # Javascript call registering one stream: media type, quality, RTMP url, video url/play path.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Extract the title and the best-quality default-type stream."""
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams at all implies the FSK age restriction is in effect.
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4247
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts.

    Rebuilds the canonical post URL from the blog name and post id,
    then scrapes the JS-escaped player markup (hence the literal
    ``\\x22`` quote sequences in the patterns) for the video file URL,
    extension, first poster image and title.
    """
    _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonicalize to the post URL; the post page embeds the player.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # Fixed user-facing typo ("No video founded").
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster; guard the match instead of crashing
        # with AttributeError on pages without a poster list.
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        m_thumb = re.search(re_thumb, webpage)
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos.
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        m_title = re.search(re_title, webpage)
        if m_title is None:
            # Previously an unguarded .group() raised AttributeError here.
            self._downloader.report_error(u'unable to extract title')
            return []
        title = unescapeHTML(m_title.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4281
4282
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in a tuple and instantiate them in one place, so
    # the priority ordering reads as a plain list of names.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4339
def get_info_extractor(ie_name):
    """Return the info extractor class registered under the given name.

    ie_name is the extractor name without the 'IE' suffix (e.g. 'Youtube'
    for YoutubeIE).  Raises KeyError if no such class exists in this
    module.
    """
    return globals()['%sIE' % ie_name]