Merge branch 'master' of github.com:rg3/youtube-dl
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor turns a URL into one or more dictionaries
    describing the video(s) behind it (real media URL, title, uploader,
    and so on).  Those dictionaries are handed to the FileDownloader,
    which takes care of the actual download and any further processing.

    Mandatory keys in every result dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All of these fields should be Unicode strings.

    Subclasses must define a _VALID_URL regular expression and override
    _real_initialize() and _real_extract(); the latter returns a *list*
    of dictionaries shaped as above.  They should normally also be added
    to the list of extractors.  Broken extractors should set _WORKING to
    False so users are warned and the test suite skips them.
    """

    _ready = False          # becomes True once _real_initialize() has run
    _downloader = None      # FileDownloader instance, set via set_downloader()
    _WORKING = True         # flip to False in known-broken subclasses

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring up a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True when this extractor can handle *url*."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Return whether this extractor is known to work."""
        return cls._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the info dicts for *url*."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader used for output and parameters."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual setup work; subclasses override as needed."""
        pass

    def _real_extract(self, url):
        """Actual extraction work; subclasses override."""
        pass

    @property
    def IE_NAME(self):
        # Derive the extractor name from the class name minus the "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open *url_or_request* and return the response handle.

        *note* controls progress output: None prints the default message,
        False suppresses output, any other value is printed verbatim.
        Raises ExtractorError on network failure.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            message = u'Unable to download webpage' if errnote is None else errnote
            raise ExtractorError(u'%s: %s' % (message, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download the page and return its content as a text string."""
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        raw = urlh.read()
        # Honor the charset from the Content-Type header, defaulting to UTF-8.
        charset_match = re.match(
            r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
            urlh.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain string URL has no get_full_url().
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(raw).decode('ascii'))
        return raw.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print *msg* to screen, prefixed with '[ie_name]'."""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Announce that information extraction has started."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Announce a webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Announce an age-confirmation attempt."""
        self.to_screen(u'Confirming age')

    # Helpers implementing #608: they tag results with the proper '_type'.
    def video_result(self, video_info):
        """Mark *video_info* as a plain video result and return it."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Build a result pointing at another URL to be processed."""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Build a playlist result wrapping *entries*."""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose (re.VERBOSE) pattern: accepts full watch URLs, embed/e/v paths,
    # youtu.be short links, anchor (#/) redirects, and bare video IDs.
    # Group 1 matches the URL prefix (if any); group 2 is the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Format (itag) codes, listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing defaults to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string, used in format listings
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'  # overrides the property-derived default name

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name, or an
        (error_message, None) tuple on failure (callers test for tuple)."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for *video_id*."""
        sub_lang_list = self._get_available_subtitles(video_id)
        # NOTE(review): on failure sub_lang_list is a tuple, and
        # report_video_subtitles_available calls .keys() on it — confirm.
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download one subtitle track.

        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Fetch the single subtitle track selected by the user options
        (falling back to English, then to the first available language).

        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): # a tuple means listing available subtitles failed
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Fetch every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): # a tuple means listing available subtitles failed
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one 'itag : extension [dimensions]' line per format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language, optionally log in (credentials from
        options or ~/.netrc), and confirm age.  All failures are reported
        as warnings/errors and abort initialization silently."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX and dsh tokens the login form requires
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being present again means the credentials failed
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID parsed out of *url* (group 2 of _VALID_URL),
        or None after reporting an error if the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download the watch page and get_video_info data for *url* and
        return a list of info dictionaries, one per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-escaped URL (\\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try each 'el' variant until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators/whitespace before parsing the date
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the <meta name="description"> tag
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict also carries a 'sig'
            # key; an entry with itag+url but no sig would raise KeyError
            # here — confirm against current get_video_info responses.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dictionary per (format, URL) pair selected above
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
676
677
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST the age form.

        Metacafe withholds some videos until the family filter is disabled,
        which requires submitting the disclaimer form once per session.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            # Response body is not needed; the request just sets up cookies.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video.

        Returns a one-element list with the info dict, a url_result for
        YouTube-hosted videos, or None after reporting an error.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube ('yt-<id>' pseudo-ids)
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information.
        # _download_webpage returns decoded text, so no .decode() calls are
        # needed below (they were redundant on Python 2 and crash on
        # Python 3, where str has no decode method).
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse the flashvars parameter for the media data JSON.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
781
782
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def _real_extract(self, url):
        """Extract the best-quality media URL and metadata for a Dailymotion page."""
        # The video id is the path component before the '_' (simplified
        # title) suffix and before any query string.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so restricted
        # videos are served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Choose the best available quality, highest first.
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self.to_screen(u'Using %s' % max_quality)

        url_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(url_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # Uploader: try the regular owner markup first, then the
        # "official user" author span; warn if neither matches.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date is shown as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
863
864
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a photobucket video.

        Returns a one-element list with the info dict, or None after
        reporting an error.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.  Use the
        # shared _download_webpage helper (like the other extractors) instead
        # of a bare urlopen: it gives consistent error reporting and decoded
        # text, so the former .decode('utf-8') calls (which crash on
        # Python 3) are no longer needed.
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
917
918
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video page.

        Non-'/watch/' URLs are first resolved to their canonical '/watch/'
        form and re-extracted (one level of recursion, guarded by new_video).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # group(1) is only the 'people'/'profile' path alternation; the
        # uploader name is the anchor text captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1049
1050
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from the config JSON embedded in a Vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON.  Catch Exception rather than using a bare
        # `except:` so KeyboardInterrupt/SystemExit are not swallowed; only
        # IndexError (split miss) and ValueError (bad JSON) are expected.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first codec available in the best quality bucket.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1161
1162
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles two page kinds: live-stream pages (URLs ending in
    index-<n>.html) and "+7" catch-up pages.  Extraction chains regex
    scrapes over successive pages via grep_webpage().
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urlopen raises ValueError for malformed URLs.
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and collect the groups named in *matchTuples*.

        matchTuples is a list of (group_index, key, error_message) tuples;
        if the regex does not match, or any listed group is None, the error
        is reported and None is returned.  Otherwise returns {key: group}.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Scrape the live-stream player pages for the rtmp URL.

        NOTE(review): the final video_url is computed but never returned,
        so callers receive None — live extraction looks unfinished; confirm
        before relying on this path.
        """
        # Language code is a fixed path component, e.g. .../fr/... or .../de/...
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of "+7" metadata documents and return the info dict."""
        # Language code position differs from the live URLs (-3 vs -4).
        video_lang = url.split('/')[-3]
        # Step 1: the player embed points at a videoref XML document.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the videoref lists one <video> entry per language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the per-language document carries id, title, date and the
        # HD media URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to live or "+7" extraction based on the URL shape.

        NOTE(review): the live branch returns None (see extractLiveStream),
        not an info list.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1286
1287
1288 class GenericIE(InfoExtractor):
1289     """Generic last-resort information extractor."""
1290
1291     _VALID_URL = r'.*'
1292     IE_NAME = u'generic'
1293
1294     def report_download_webpage(self, video_id):
1295         """Report webpage download."""
1296         if not self._downloader.params.get('test', False):
1297             self._downloader.report_warning(u'Falling back on generic information extractor.')
1298         super(GenericIE, self).report_download_webpage(video_id)
1299
    def report_following_redirect(self, new_url):
        """Report that a redirect was detected and is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1303
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and compares the
        final URL with the original.  Returns the new URL string if a
        redirect happened, False otherwise.
        """
        class HeadRequest(compat_urllib_request.Request):
            # A Request that issues HEAD instead of GET, so the final URL can
            # be discovered without downloading the body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other redirect code is treated as an error.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE(review): OpenerDirector dispatches by each handler's
        # handler_order, not by this list's order — presumably reordering is
        # safe, but confirm before changing.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect occurred.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1357
1358     def _real_extract(self, url):
1359         new_url = self._test_redirect(url)
1360         if new_url: return [self.url_result(new_url)]
1361
1362         video_id = url.split('/')[-1]
1363         try:
1364             webpage = self._download_webpage(url, video_id)
1365         except ValueError as err:
1366             # since this is the last-resort InfoExtractor, if
1367             # this error is thrown, it'll be thrown here
1368             self._downloader.report_error(u'Invalid URL: %s' % url)
1369             return
1370
1371         self.report_extraction(video_id)
1372         # Start with something easy: JW Player in SWFObject
1373         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1374         if mobj is None:
1375             # Broaden the search a little bit
1376             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1377         if mobj is None:
1378             # Broaden the search a little bit: JWPlayer JS loader
1379             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1380         if mobj is None:
1381             self._downloader.report_error(u'Invalid URL: %s' % url)
1382             return
1383
1384         # It's possible that one of the regexes
1385         # matched, but returned an empty group:
1386         if mobj.group(1) is None:
1387             self._downloader.report_error(u'Invalid URL: %s' % url)
1388             return
1389
1390         video_url = compat_urllib_parse.unquote(mobj.group(1))
1391         video_id = os.path.basename(video_url)
1392
1393         # here's a fun little line of code for you:
1394         video_extension = os.path.splitext(video_id)[1][1:]
1395         video_id = os.path.splitext(video_id)[0]
1396
1397         # it's tempting to parse this further, but you would
1398         # have to take into account all the variations like
1399         #   Video Title - Site Name
1400         #   Site Name | Video Title
1401         #   Video Title - Tagline | Site Name
1402         # and so on and so forth; it's just not practical
1403         mobj = re.search(r'<title>(.*)</title>', webpage)
1404         if mobj is None:
1405             self._downloader.report_error(u'unable to extract title')
1406             return
1407         video_title = mobj.group(1)
1408
1409         # video uploader is domain name
1410         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1411         if mobj is None:
1412             self._downloader.report_error(u'unable to extract title')
1413             return
1414         video_uploader = mobj.group(1)
1415
1416         return [{
1417             'id':       video_id,
1418             'url':      video_url,
1419             'uploader': video_uploader,
1420             'upload_date':  None,
1421             'title':    video_title,
1422             'ext':      video_extension,
1423         }]
1424
1425
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch[N|all]: prefix and return the search results.

        Returns the list produced by _get_n_results, or None after
        reporting an error for an invalid query or count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the results were computed but never returned here,
            # so "ytsearchall:" queries silently yielded nothing
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer matches than requested; shrink the
            # limit so we stop paging once everything available is fetched
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # renamed loop variable: 'id' shadowed the builtin
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube') for vid in video_ids]
        return videos
1496
1497
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch[N|all]: prefix and trigger the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            try:
                count = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                count = self._max_google_results
            self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect new (unseen) video ids from this result page
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid in video_ids:
                    continue
                video_ids.append(vid)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for vid in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                    return

            # No "next page" link: dispatch whatever was found and stop
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum += 1
1575
1576
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch[N|all]: prefix and trigger the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        else:
            try:
                count = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
                count = self._max_yahoo_results
            self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect new (unseen) video ids from this result page
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid in seen:
                    continue
                seen.add(vid)
                video_ids.append(vid)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for vid in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            # No "Next" link: dispatch whatever was found and stop
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum += 1
1658
1659
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the GData playlist feed and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        videos = []
        page_num = 0

        while True:
            page_num += 1
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Remember (position, url) pairs so the list can be ordered later
            for entry in feed['entry']:
                if 'content' in entry:
                    videos.append((entry['yt$position']['$t'], entry['content']['src']))

            # A short page means this was the last one
            if len(feed['entry']) < self._MAX_RESULTS:
                break

        # Sort by playlist position, then keep the URLs only
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1738
1739
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the unique video ids in an HTML fragment, first-seen order."""
        ids_in_page = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in ids_in_page:
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Collect all videos of a channel and wrap them in a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML
        self.report_download_page(channel_id, pagenum)
        request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids = self.extract_videos_from_page(page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                self.report_download_page(channel_id, pagenum)
                request = compat_urllib_request.Request(self._MORE_PAGES_URL % (pagenum, channel_id))
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        video_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(video_url, 'Youtube') for video_url in video_urls]
        return [self.playlist_result(url_entries, channel_id)]
1812
1813
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch all upload ids of a user and return them as a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The YouTube Data API caps each response (currently at 50 videos),
        # so keep querying page by page until a short page signals the end.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping only first occurrences
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one - there are
            # no more ids on further pages, so no need to query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        video_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(video_url, 'Youtube') for video_url in video_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1881
1882
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Collect all video URLs of a blip.tv user as a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # BUG FIX: the users_id lookup used to run inside the network try
        # block with no None check, so a page lacking data-users-id crashed
        # with an uncaught AttributeError instead of a proper error message.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract users_id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # use compat_str like the rest of the file (was plain str)
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1958
1959
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a DepositFiles file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # BUG FIX: the pattern was a non-raw string, so '\s' was an
                # invalid string escape (deprecated, an error in newer Pythons)
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2010
2011
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from the downloader's username/password params
        or, with usenetrc, from the 'facebook' entry in ~/.netrc.  With
        no credentials this is a silent no-op; login failures only emit
        warnings, so extraction of public videos can still proceed.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials configured: skip login entirely
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the login did not succeed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video info from a Facebook video page.

        Parses the JSON swf parameters embedded in the page, preferring
        the HD source over the SD one.  Raises ExtractorError when the
        expected markup cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf variables sit between these two JS fragments in the page
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON holding the actual video sources
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2109
2110
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Handles three cases:
          * /play/ embed URLs: follow the redirect, pull the file id out
            of the URL fragment and restart extraction on the page URL;
          * responses with a video/* Content-Type: direct download;
          * regular pages: resolved through blip.tv's JSON API.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            # The redirect target carries the file path in its fragment.
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves the JSON skin to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # On Python 2 the URL path component is a byte string and
                # must be decoded; on Python 3 it is already str and
                # calling .decode() would raise AttributeError.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp looks like '12-31-12 11:59PM'; normalize to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2207
2208
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract the flv media URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: this previously called self._download.report_error — a
            # non-existent attribute — so invalid URLs crashed with
            # AttributeError instead of being reported.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media server base path.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2250
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates offered by the site, ordered from highest to lowest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension for each bitrate (all mp4 at present).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution for each bitrate, used by --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Report that the media configuration XML is being downloaded."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report that the episode's MRSS index is being downloaded."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrates with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of an episode or clip.

        Resolves :shortname abbreviations to the show's full-episodes
        page, follows the site redirect to the newest episode when no
        specific episode was requested, then reads the MRSS index and
        per-media configuration XML to build direct mp4 URLs from the
        RTMP rendition paths.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations into real URLs and
        # re-match so the named groups below are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode means "download the newest one".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the latest episode;
            # recover the concrete episode title from the final URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id without a URL
            # prefix; extract the alternate data-mgid reference and then
            # add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp_url) pairs from the renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP path into a direct HTTP mp4 URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2442
2443
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Report that the player configuration is being downloaded."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video for an Escapist Magazine episode page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            # Honor the charset declared in the Content-Type header.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            encoding = charset_m.group(1) if charset_m else 'utf-8'
            webPage = raw_page.decode(encoding)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in the page's <meta> tags.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', webPage).group(1))
        imgUrl = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', webPage).group(1))
        playerUrl = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', webPage).group(1))
        # The player URL carries the (quoted) config URL as a parameter.
        configUrl = compat_urllib_parse.unquote(re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            config_handle = compat_urllib_request.urlopen(configUrl)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            configJSON = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        videoUrl = config['playlist'][1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
2514
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Resolve a collegehumor video page into an f4f fragment URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {'id': video_id, 'uploader': None, 'upload_date': None}

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # Pull title/description/thumbnail and the manifest URL from the
        # metadata XML.
        meta_doc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            video_el = meta_doc.findall('./video')[0]
            for field, tag in (('description', 'description'),
                               ('title', 'caption'),
                               ('thumbnail', 'thumbnail')):
                info[field] = video_el.findall('./' + tag)[0].text
            manifest_url = video_el.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest provides the node id and the real video id.
        manifest_doc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2581
2582
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is URL-encoded inside the flashvars.
        url_m = re.search(r'flv_url=(.+?)&', webpage)
        if url_m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_m.group(1))

        title_m = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_m.group(1)

        thumb_m = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_m is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_m.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2636
2637
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a SoundCloud track URL to its mp3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Uploader and track slug are both encoded in the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        track_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + track_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_req = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_req).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_req = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_req).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2704
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a SoundCloud set URL and return one info dict per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                # Normalize to YYYYMMDD, matching SoundcloudIE and the
                # upload_date contract; previously the raw API timestamp
                # was passed through unconverted.
                'upload_date': unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2778
2779
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded rtmpe path
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional - fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the *last* dot so a basename containing dots does not
        # make the two-element unpacking raise a ValueError.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2829
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If the requested bitrate is None, 'best' or unavailable, the
        highest available bitrate for the format is selected.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: re.match on a str yields str groups already; the former
        # .decode('utf-8') calls raised AttributeError under Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (urlopen().read() returns bytes; decode before loading)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Initialize so an empty formats dict cannot leave these unbound
            file_url = None
            format_param = None
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2937
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search(r'<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search(r'<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage_bytes = compat_urllib_request.urlopen(rootURL).read()
                # urlopen().read() returns bytes; decode so the str regex
                # below can match under Python 3 (a str pattern against
                # bytes raises TypeError).
                rootpage = rootpage_bytes.decode('utf-8', 'ignore')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3041
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns a decoded (unicode) page, so the
        # regex groups are str; the former .decode('iso-8859-1') calls raised
        # AttributeError under Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3117
3118
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two randoms."""
        millis = int(time.time() * 1000)
        rand_a = random.randint(1000, 1998)
        rand_b = random.randint(1000, 9999)
        return "%d%d%d" % (millis, rand_a, rand_b)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle Youku's character alphabet with their seeded PRNG.

        Returns the shuffled alphabet as a list of single characters.
        """
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        # Draw characters one at a time; every character is unique, so
        # popping by index is equivalent to the remove-by-value original.
        while alphabet:
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(alphabet)))
            shuffled.append(alphabet.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index list into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(part)] for part in fileId.split('*') if part)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto Youku's stream names.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [seg['k'] for seg in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Columns 8,9 of the fileid carry the segment number, so every
        # segment swaps in its own (hex) index there.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3220
3221
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_download_webpage(video_id)

        # Fetch the page and decode it in one step
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # Video URL (percent-encoded in the page source)
        mobj = re.search(self.VIDEO_URL_RE, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Title
        mobj = re.search(self.VIDEO_TITLE_RE, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Thumbnail
        mobj = re.search(self.VIDEO_THUMB_RE, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3276
3277
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # Must stop here: falling through crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes on the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # Must stop here: falling through crashed on links[-1]
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3398
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first (unescaped) group of rexp in the page,
            # or the given default when there is no match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: 'uploader_date' was a typo; the documented optional
            # field consumed downstream is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3434
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # Return an empty page rather than None: the caller tuple-unpacks
            # the result, and None made that raise a TypeError.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archives API is paginated
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the end of the archives
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3517
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUG FIX: previously fell through to m.group() and crashed with
            # an AttributeError that masked the real extraction failure.
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player page headline; fall back to the <title> tag.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # BUG FIX: same fall-through crash as above.
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None  # description is optional

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3556
class SteamIE(InfoExtractor):
    """Information extractor for trailers on the Steam store."""
    _VALID_URL = r"""http://store.steampowered.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        match = re.match(cls._VALID_URL, url, re.VERBOSE)
        return match is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Request the age-gated page directly with a fixed birth date so the
        # trailer metadata is always present in the response.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        url_matches = re.finditer(urlRE, webpage)
        name_matches = re.finditer(namesRE, webpage)
        thumb_matches = re.finditer(thumbsRE, webpage)

        # The three scans run in lockstep: the n-th movie entry, title span
        # and thumbnail are assumed to describe the same clip.
        videos = []
        for url_m, name_m, thumb_m in zip(url_matches, name_matches, thumb_matches):
            clip_id = url_m.group('videoID')
            clip_url = url_m.group('videoURL')
            if not clip_url:
                self._downloader.report_error(u'Cannot find video url for %s' % clip_id)
            videos.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(name_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail')
            })
        return [self.playlist_result(videos, gameID, game_title)]
3601
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The media file lives at a CDN path derived directly from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3623
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Direct links to the media file hosted on the hw-videos CDN.
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Derive the container from the URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # BUG FIX: fallback title read 'World Start Hip Hop'.
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3679
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON blob in an inline script tag.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream variant.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3714
3715
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional — warn but continue without it)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional as well)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path element encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this previously tested the stale 'result' variable
            # from the regex searches above, so a request for an unavailable
            # format silently returned [None] instead of reporting an error.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3832
3833
3834
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is part of the URL itself.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the message previously claimed the *title* failed,
            # but this branch handles the upload-date extraction.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3876
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page first; it links to the actual embed page.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        # The embed page carries its own (numeric) id.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The file URL is handed to the flash player via addVariable().
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3922
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment in the page script.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id is enough to satisfy the playback API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        i = 0
        # Walk the play/next API until it flags the last track.
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            i += 1
        return tracks
3966
class KeekIE(InfoExtractor):
    """Information extractor for keek.com videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs follow fixed CDN patterns based on the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3990
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction depending on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        # Each talk URL is delegated back to this extractor as a separate entry.
        # NOTE(review): video_id is computed here but never used.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talk id and media slug live in an inline talkDetails script blob.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4068
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name 'ext' and
            # raised NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional in the metadata XML.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4124
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Per-video XML lists the available file variants.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child element is used — presumably the best variant;
        # TODO(review): confirm the XML ordering guarantees this.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4157
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: previously fell through to m.group() and crashed with
            # an AttributeError that masked the real extraction failure.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4206
class ARDIE(InfoExtractor):
    """Information extractor for ARD Mediathek / mediathek.daserste.de videos."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page headline used as the video title.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # Each addMediaStream(...) call in the page lists one stream variant.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer a numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams is taken to mean the video is age-restricted ("fsk").
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4246
class TumblrIE(InfoExtractor):
    """Extractor for videos embedded in Tumblr blog posts."""
    # Dots in the host part are escaped so that e.g. "tumblrXcom" no longer
    # matches; anything after the numeric post id is ignored.
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonical post URL; the returned page embeds the player markup.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup escapes quotes as \x22; literal dots in the host
        # are escaped so the pattern matches only the real video-file URL.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4280
4281
def gen_extractors():
    """Build and return one instance of every supported extractor.

    Order is significant: the first extractor whose pattern matches a URL
    is the one that handles it, so GenericIE must stay last.
    """
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]
4338
def get_info_extractor(ie_name):
    """Look up the extractor class named `ie_name` + 'IE' in this module.

    Raises KeyError if no such extractor class exists.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]