Improve Twitch.tv chapter support (#810)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    Given a URL, an information extractor produces one or more dictionaries
    describing the video(s) behind it. Each dictionary is handed to the
    FileDownloader, which may download the media, print metadata, etc.

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] URL handle to download the file from, as
                    returned by urllib.request.urlopen.

    All values should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); the latter returns a *list* of
    dictionaries shaped as above. New extractors should also be added to
    the list of extractors.

    Broken extractors should set _WORKING to False so that users are warned
    and the tests skip them.
    """

    # Defaults for per-instance state; __init__ shadows _ready/_downloader.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True if this extractor can handle the given URL."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Return the value of the _WORKING flag."""
        return cls._WORKING

    def initialize(self):
        """Run _real_initialize() once (authentication, etc.)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach a downloader to this extractor."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual initialization; redefined in subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction; redefined in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default extractor name: the class name minus its "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None -> default message, note=False -> silent, else custom.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            msg = errnote if errnote is not None else u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (msg, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset declared in the Content-Type header, if any.
        mobj = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
                        urlh.headers.get('Content-Type', ''))
        encoding = mobj.group(1) if mobj else 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is already a plain URL string.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(webpage_bytes).decode('ascii'))
        return (webpage_bytes.decode(encoding, 'replace'), urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
190
191
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles watch/embed/short URLs as well as naked video IDs, performs the
    optional login and age confirmation, and extracts the direct media URLs
    from the get_video_info stream map.
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" (informational only, used for --list-formats)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; defer them to the
        # playlist extractor.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to list the available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict {lang_code: track_name}, or an (error, None) tuple
        when the list could not be fetched or is empty."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download one subtitle track.

        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Download the subtitle in the requested language (or English, or the
        first available one).

        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print the available formats (itag, extension, dimensions)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX/dsh form tokens out of the login page; the
        # login POST is rejected without them.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is served back, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # FIX: urlencode returns a str; on Python 3 urlopen requires bytes
        # for POST data, so encode it just like the login request above.
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID contained in url (group 2 of _VALID_URL),
        or report an error and return None for unparsable URLs."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values, as different video types
        # (embedded, vevo, ...) answer to different ones.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # FIX: the filter above only guarantees 'itag' and 'url'; some
            # stream map entries carry no plain 'sig' field, and indexing it
            # unconditionally raised KeyError and aborted the extraction.
            # Append the signature only when it is present.
            url_map = dict((ud['itag'][0],
                            ud['url'][0] + ('&signature=' + ud['sig'][0] if 'sig' in ud else ''))
                           for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
681
682
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Disable the family filter by POSTing an age confirmation."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the YouTube IE
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob: the media URL and access key
            # are embedded as JSON-ish strings inside the 'mediaData' field.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # _download_webpage already returns a unicode string, so the old
        # .decode('utf-8') calls were wrong: they crash on Python 3 and
        # implicitly ascii-encode (breaking non-ASCII) on Python 2.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
786
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best-quality media URL plus metadata for a video."""
        # Validate the URL and pull the video id out of it.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled via cookie.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Locate the flashvars blob that carries the per-quality URLs.
        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the first (= highest) quality key present in flashvars.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then the
        # official-user markup; missing uploader is only a warning.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            # lookin for official user
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; store it as YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
866
867
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a photobucket clip."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # _download_webpage (the helper the other IEs in this file use)
        # handles download errors and returns a unicode string, so the
        # old manual urlopen + .decode('utf-8') dance is unnecessary and
        # would break on Python 3 (str patterns vs bytes page).
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
920
921
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    NOTE: marked _WORKING = False, so this extractor is currently
    disabled. Flow: non-/watch/ URLs are first rewritten to a canonical
    /watch/ URL using the embedded "id"/"vid" fields, then the watch
    page and an XML playlist are scraped for the media URL.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # group(1) is only the (people|profile) alternation; the uploader
        # name itself is the second group.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1052
1053
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from the config JSON embedded in the page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Canonicalize protocol-less and direct-link URLs.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Narrowed from a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit; error reporting is unchanged.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1158
1159
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the response body.

        Returns None (after reporting the error) when the download fails
        or the URL is invalid.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map match groups to a dict.

        matchTuples is a list of (group_index, key, error_message)
        triples; each group is stored in the result under *key*.
        Returns None (after reporting) when the regex does not match or
        a required group is missing.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None and
        # re.search will raise TypeError here.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the rtmp URL.

        NOTE(review): video_url is computed below but never returned or
        used, and this method returns None — live-stream extraction is
        effectively a no-op as written.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page to its video XML and build an info dict."""
        video_lang = url.split('/')[-3]
        # First hop: the player param carries the videoref file URL.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Second hop: pick the <video> reference for the requested language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: the XML document with id, title, date and HD url.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode('utf-8') assumes a Python 2 byte
            # string; under Python 3 this would fail — confirm.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns nothing, so live
            # URLs produce no result (implicit None return here).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1283
1284
1285 class GenericIE(InfoExtractor):
1286     """Generic last-resort information extractor."""
1287
1288     _VALID_URL = r'.*'
1289     IE_NAME = u'generic'
1290
1291     def report_download_webpage(self, video_id):
1292         """Report webpage download."""
1293         if not self._downloader.params.get('test', False):
1294             self._downloader.report_warning(u'Falling back on generic information extractor.')
1295         super(GenericIE, self).report_download_webpage(video_id)
1296
1297     def report_following_redirect(self, new_url):
1298         """Report information extraction."""
1299         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1300
1301     def _test_redirect(self, url):
1302         """Check if it is a redirect, like url shorteners, in case return the new url."""
1303         class HeadRequest(compat_urllib_request.Request):
1304             def get_method(self):
1305                 return "HEAD"
1306
1307         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1308             """
1309             Subclass the HTTPRedirectHandler to make it use our
1310             HeadRequest also on the redirected URL
1311             """
1312             def redirect_request(self, req, fp, code, msg, headers, newurl):
1313                 if code in (301, 302, 303, 307):
1314                     newurl = newurl.replace(' ', '%20')
1315                     newheaders = dict((k,v) for k,v in req.headers.items()
1316                                       if k.lower() not in ("content-length", "content-type"))
1317                     return HeadRequest(newurl,
1318                                        headers=newheaders,
1319                                        origin_req_host=req.get_origin_req_host(),
1320                                        unverifiable=True)
1321                 else:
1322                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1323
1324         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1325             """
1326             Fallback to GET if HEAD is not allowed (405 HTTP error)
1327             """
1328             def http_error_405(self, req, fp, code, msg, headers):
1329                 fp.read()
1330                 fp.close()
1331
1332                 newheaders = dict((k,v) for k,v in req.headers.items()
1333                                   if k.lower() not in ("content-length", "content-type"))
1334                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1335                                                  headers=newheaders,
1336                                                  origin_req_host=req.get_origin_req_host(),
1337                                                  unverifiable=True))
1338
1339         # Build our opener
1340         opener = compat_urllib_request.OpenerDirector()
1341         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1342                         HTTPMethodFallback, HEADRedirectHandler,
1343                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1344             opener.add_handler(handler())
1345
1346         response = opener.open(HeadRequest(url))
1347         new_url = response.geturl()
1348
1349         if url == new_url:
1350             return False
1351
1352         self.report_following_redirect(new_url)
1353         return new_url
1354
1355     def _real_extract(self, url):
1356         new_url = self._test_redirect(url)
1357         if new_url: return [self.url_result(new_url)]
1358
1359         video_id = url.split('/')[-1]
1360         try:
1361             webpage = self._download_webpage(url, video_id)
1362         except ValueError as err:
1363             # since this is the last-resort InfoExtractor, if
1364             # this error is thrown, it'll be thrown here
1365             self._downloader.report_error(u'Invalid URL: %s' % url)
1366             return
1367
1368         self.report_extraction(video_id)
1369         # Start with something easy: JW Player in SWFObject
1370         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1371         if mobj is None:
1372             # Broaden the search a little bit
1373             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1374         if mobj is None:
1375             # Broaden the search a little bit: JWPlayer JS loader
1376             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1377         if mobj is None:
1378             self._downloader.report_error(u'Invalid URL: %s' % url)
1379             return
1380
1381         # It's possible that one of the regexes
1382         # matched, but returned an empty group:
1383         if mobj.group(1) is None:
1384             self._downloader.report_error(u'Invalid URL: %s' % url)
1385             return
1386
1387         video_url = compat_urllib_parse.unquote(mobj.group(1))
1388         video_id = os.path.basename(video_url)
1389
1390         # here's a fun little line of code for you:
1391         video_extension = os.path.splitext(video_id)[1][1:]
1392         video_id = os.path.splitext(video_id)[0]
1393
1394         # it's tempting to parse this further, but you would
1395         # have to take into account all the variations like
1396         #   Video Title - Site Name
1397         #   Site Name | Video Title
1398         #   Video Title - Tagline | Site Name
1399         # and so on and so forth; it's just not practical
1400         mobj = re.search(r'<title>(.*)</title>', webpage)
1401         if mobj is None:
1402             self._downloader.report_error(u'unable to extract title')
1403             return
1404         video_title = mobj.group(1)
1405
1406         # video uploader is domain name
1407         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1408         if mobj is None:
1409             self._downloader.report_error(u'unable to extract title')
1410             return
1411         video_uploader = mobj.group(1)
1412
1413         return [{
1414             'id':       video_id,
1415             'url':      video_url,
1416             'uploader': video_uploader,
1417             'upload_date':  None,
1418             'title':    video_title,
1419             'ext':      video_extension,
1420         }]
1421
1422
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch:', 'ytsearchN:' and 'ytsearchall:' pseudo-URLs by
    querying the GData API and returning the matching video URLs.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and return the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 'ytsearch' keyword
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the result list was previously computed but never
            # returned, so 'ytsearchall:' queries silently yielded nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves at most 50 results per page; 'limit' shrinks to
        # totalItems once the first response is in.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1493
1494
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the 'gvsearch' keyword
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            num = int(prefix)
            if num <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (num, query))
                return
            if num > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, num))
                num = self._max_google_results
            self._download_n_results(query, num)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        found_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect new video identifiers from this results page
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                vid = mobj.group(1)
                if vid in found_ids:
                    continue
                found_ids.append(vid)
                if len(found_ids) == n:
                    # Specified number of videos reached; queue them all
                    for vid in found_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                    return

            # No "next" link means this was the last page
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in found_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum += 1
1572
1573
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the 'yvsearch' keyword
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            num = int(prefix)
            if num <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (num, query))
                return
            if num > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, num))
                num = self._max_yahoo_results
            self._download_n_results(query, num)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect video identifiers, skipping duplicates
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                vid = mobj.group(1)
                if vid in seen:
                    continue
                seen.add(vid)
                collected.append(vid)
                if len(collected) == n:
                    # Specified number of videos reached; queue them all
                    for vid in collected:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            # No "Next" link means this was the last page
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in collected:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum += 1
1655
1656
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the base-class default
        # (a plain re.match) cannot be used here.
        return bool(re.match(cls._VALID_URL, url, re.VERBOSE))

    def _real_extract(self, url):
        """Page through the GData API and return the playlist's videos."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # The id lives in whichever alternative of _VALID_URL matched
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        positioned = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so pages can be re-ordered later
            for entry in response['feed']['entry']:
                if 'content' in entry:
                    positioned.append((entry['yt$position']['$t'], entry['content']['src']))

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        positioned.sort()
        video_urls = [pair[1] for pair in positioned]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in video_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1725
1726
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the distinct video ids referenced on a channel page."""
        found = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = mobj.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        """Return a playlist containing every video of the channel."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # The first page is served as plain HTML
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids.extend(self.extract_videos_from_page(page))

        # Any further pages come from the json-based channel_ajax endpoint
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw = self._download_webpage(url, channel_id,
                                             u'Downloading page #%s' % pagenum)
                page = json.loads(raw)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                # Stop once the widget no longer offers a "load more" link
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                       for vid in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1785
1786
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return a playlist of every video uploaded by the user."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps results per request (currently 50), so keep
        # requesting pages until a short page signals the end of the list.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the distinct video ids on this page
            page_ids = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                vid = mobj.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            video_ids.extend(page_ids)

            # A page that is not completely full must be the last one;
            # no need to issue another query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                       for vid in video_ids]
        return [self.playlist_result(url_results, playlist_title=username)]
1844
1845
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a user page to its numeric users_id, then pages through the
    mobile Ajax episode list to collect every video URL.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist with every video posted by the user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # BUG FIX: mobj.group(1) was previously called unconditionally,
            # raising an opaque AttributeError when the user page carried
            # no data-users-id attribute.
            raise ExtractorError(u'Unable to extract blip.tv user id from %s' % url)
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1905
1906
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a hosted file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # BUG FIX: use a raw string so '\s' is a regex class rather
                # than an (invalid) string escape sequence.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1957
1958
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Credentials come from --username/--password or, with --netrc,
        from the 'facebook' machine entry. Runs once before extraction;
        all failures are reported as warnings, never fatal errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; proceed anonymously
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being served again means the credentials
            # were rejected (or the rate limit was hit)
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # BUG FIX: corrected the misspelling 'exceded' in this
                # user-facing warning message.
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract id, title, mp4 URL, duration and thumbnail for a video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player configuration is embedded as JSON between these two
        # fixed script fragments
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2056
2057
2058 class BlipTVIE(InfoExtractor):
2059     """Information extractor for blip.tv"""
2060
2061     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2062     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2063     IE_NAME = u'blip.tv'
2064
    def report_direct_download(self, title):
        """Report that the URL serves the media file directly (no JSON metadata)."""
        self.to_screen(u'%s: Direct download detected' % title)
2068
2069     def _real_extract(self, url):
2070         mobj = re.match(self._VALID_URL, url)
2071         if mobj is None:
2072             self._downloader.report_error(u'invalid URL: %s' % url)
2073             return
2074
2075         urlp = compat_urllib_parse_urlparse(url)
2076         if urlp.path.startswith('/play/'):
2077             request = compat_urllib_request.Request(url)
2078             response = compat_urllib_request.urlopen(request)
2079             redirecturl = response.geturl()
2080             rurlp = compat_urllib_parse_urlparse(redirecturl)
2081             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2082             url = 'http://blip.tv/a/a-' + file_id
2083             return self._real_extract(url)
2084
2085
2086         if '?' in url:
2087             cchar = '&'
2088         else:
2089             cchar = '?'
2090         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2091         request = compat_urllib_request.Request(json_url)
2092         request.add_header('User-Agent', 'iTunes/10.6.1')
2093         self.report_extraction(mobj.group(1))
2094         info = None
2095         try:
2096             urlh = compat_urllib_request.urlopen(request)
2097             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2098                 basename = url.split('/')[-1]
2099                 title,ext = os.path.splitext(basename)
2100                 title = title.decode('UTF-8')
2101                 ext = ext.replace('.', '')
2102                 self.report_direct_download(title)
2103                 info = {
2104                     'id': title,
2105                     'url': url,
2106                     'uploader': None,
2107                     'upload_date': None,
2108                     'title': title,
2109                     'ext': ext,
2110                     'urlhandle': urlh
2111                 }
2112         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2113             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2114         if info is None: # Regular URL
2115             try:
2116                 json_code_bytes = urlh.read()
2117                 json_code = json_code_bytes.decode('utf-8')
2118             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2119                 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2120                 return
2121
2122             try:
2123                 json_data = json.loads(json_code)
2124                 if 'Post' in json_data:
2125                     data = json_data['Post']
2126                 else:
2127                     data = json_data
2128
2129                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2130                 video_url = data['media']['url']
2131                 umobj = re.match(self._URL_EXT, video_url)
2132                 if umobj is None:
2133                     raise ValueError('Can not determine filename extension')
2134                 ext = umobj.group(1)
2135
2136                 info = {
2137                     'id': data['item_id'],
2138                     'url': video_url,
2139                     'uploader': data['display_name'],
2140                     'upload_date': upload_date,
2141                     'title': data['title'],
2142                     'ext': ext,
2143                     'format': data['media']['mimeType'],
2144                     'thumbnail': data['thumbnailUrl'],
2145                     'description': data['description'],
2146                     'player_url': data['embedUrl'],
2147                     'user_agent': 'iTunes/10.6.1',
2148                 }
2149             except (ValueError,KeyError) as err:
2150                 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2151                 return
2152
2153         return [info]
2154
2155
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was self._download.report_error, which raised an
            # AttributeError; the attribute is named _downloader everywhere
            # else in this file.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media server base URL; the FLV
        # lives next to the thumbnails under the same path.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2197
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates the mediagen config may offer, lowest quality last used as
    # fallback order ([-1] below picks the highest).
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (for --list-formats output only).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display dimensions per bitrate (for --list-formats output only).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE, which
        # the default suitable() implementation does not pass to re.match.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each known bitrate with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/episode/clip URL to one info dict per video part.

        Pipeline: normalize shortname URLs -> find the mtvnservices URI in
        the page -> download the MRSS index of parts -> for each part,
        download its mediaGen config, pick a bitrate, and rewrite the RTMP
        URL to a plain HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # ':tds' / ':colbert' style abbreviations map to the newest full
        # episode page of the respective show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode means "download the newest one"; the site
            # redirects the bare full-episodes URL to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Follow the redirect to learn which episode we actually got.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The MRSS feed lists one <item> per part of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid looks like '...:<show>.com:...:<shortMediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # Collect (bitrate, rtmp_url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into an equivalent plain-HTTP one on the
            # mtvnmobile CDN, which serves the same gsp.comedystor path.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2365
2366
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract video info from an escapistmagazine.com video page.

        Reads the Open Graph meta tags for thumbnail and player URL, then
        downloads the player configuration (JavaScript posing as JSON) to
        find the actual video URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Guard each search: a missing meta tag used to crash with an
        # AttributeError on .group(1); fail with a clear message instead.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'Unable to extract description')
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract config URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # playlist[1] holds the actual video entry (playlist[0] is not it).
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2422
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Extractor is known broken; kept for reference until fixed.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video info via the moogaloop metadata XML and the
        Adobe HDS (f4m) manifest it points to.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally from the metadata and manifest below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # Title, description, thumbnail and the f4m manifest URL all come
        # from the <video> node of the metadata XML.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest (Adobe f4m 1.0 namespace) yields the media node
        # id and the real video id used to build the fragment URL.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # NOTE(review): builds the first HDS fragment URL directly
        # ('Seg1-Frag1'); presumably only the first fragment is fetched,
        # which may be why _WORKING is False — confirm before re-enabling.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2489
2490
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL (percent-encoded inside the page's flashvars)
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title, taken from the page <title> minus the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the whole matched URL is the image address
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2544
2545
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a soundcloud.com track URL to its 128kbps MP3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also in the url)
        slug_title =  mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps the human-readable URL to the track's
        # numeric API id.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2603
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a soundcloud.com set URL and return one entry per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title =  mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps the human-readable URL to the set's
        # track listing.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2667
2668
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP video URL and metadata from an infoq.com page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL (the real path is base64-encoded in the page)
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split the extension off from the right, so filenames that contain
        # additional dots no longer raise a ValueError when unpacking
        # (plain split('.') produced more than two parts in that case).
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2718
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a GET; the first that opens wins.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print every format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a cloudcast's audio URL via the mixcloud JSON API.

        NOTE(review): the .decode('utf-8') calls below assume byte strings
        and therefore Python 2; on Python 3 str has no .decode and this
        method would crash — likely one reason _WORKING is False. Confirm
        before re-enabling.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # With no requested format, walk the formats until one of them has
        # a URL that actually responds; otherwise honor the request.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2826
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract one video, a whole course, or the whole site depending on
        which URL groups matched:
          * course + video -> a single video info dict
          * course only    -> every video linked from the course page
          * neither        -> every course linked from the home page
        Playlist entries are resolved recursively through self.extract().
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # The per-video XML carries the title and the video file
                # name relative to baseUrl.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id when missing.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # All video pages linked from the course page, de-duplicated
            # while preserving order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Recursively extract every referenced video page.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # All course pages linked from the home page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            # Recursively extract every referenced course page.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2930
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Scrape the song/performer meta tags, then fetch the mediaGen XML
        to pick a rendition (currently always the highest quality one)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns decoded text, so the old
        # .decode('iso-8859-1') calls were wrong (and break on Python 3,
        # where str has no decode method).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message used to read 'unable to mtvn_uri'
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3006
3007
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: ms timestamp plus two random numbers."""
        return "%d%d%d" % (int(time.time() * 1000),
                           random.randint(1000, 1998),
                           random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Shuffle Youku's character alphabet with an LCG keyed on *seed*.

        Returns the shuffled characters as a list; _get_file_id indexes
        into it to decode the obfuscated file id.
        """
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        state = float(seed)
        shuffled = []
        while pool:
            # linear congruential step, then scale into the remaining pool
            state = (state * 211 + 30031) % 65536
            pos = int(math.floor(state / 65536 * len(pool)))
            shuffled.append(pool.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(idx)] for idx in fileId.split('*') if idx)

    def _real_extract(self, url):
        """Return one info dict per video segment listed in the playlist JSON."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            available = list(entry['streamfileids'].keys())

            # map the user's format choice onto what Youku serves
            if requested is None or requested == 'best':
                format = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # characters at positions 8-9 of the decoded file id carry the
        # segment number, so they are replaced per segment below
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3102
3103
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Fetch the video page and scrape flv URL, title and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Pull the three fields with one loop; abort on the first miss.
        scraped = []
        for pattern, what in ((self.VIDEO_URL_RE, u'video url'),
                              (self.VIDEO_TITLE_RE, u'video title'),
                              (self.VIDEO_THUMB_RE, u'video thumbnail')):
            found = re.search(pattern, webpage)
            if found is None:
                self._downloader.report_error(u'unable to extract ' + what)
                return
            scraped.append(found.group(1))

        flv_url, video_title, video_thumbnail = scraped

        return [{
            'id': video_id,
            'url': compat_urllib_parse.unquote(flv_url),
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3151
3152
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video embedded in a Google+ post.

        Step 1 scrapes the post page for metadata and the photo/video page
        URL; step 2 scrapes that page for the actual stream links and picks
        the highest resolution.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            # was report_error without abort, which then crashed on
            # mobj.group(1) below; raise instead
            raise ExtractorError(u'unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            # was report_error without abort, which then crashed on
            # links[-1] below; raise instead
            raise ExtractorError(u'unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3263
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN mp4 URL from the page path and scrape title/date/
        description from the page's meta tags."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First capture group of rexp in the page, HTML-unescaped,
            # or default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # was 'uploader_date' (typo); the documented field is 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3299
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a channel (all archives, paged), a single
    # broadcast (/b/<id>), or a chapter of a broadcast (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100  # API page size used when listing a channel
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one page of the clip-listing API and turn every clip
        that has a video_file_url into an info dict.

        Returns (total clips in the response, list of info dicts); the
        total lets the caller detect the last page.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On errors the API answers with an object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' starts with YYYY-MM-DD; drop dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Route the URL to the right API endpoint.

        Chapters (/c/) are resolved via the broadcast ("archive") they were
        cut from and returned directly; channels and single broadcasts go
        through the paged clip-listing loop at the bottom.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The chapter page embeds the id of its source archive.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)
            m = re.search(r"<h2 class='js-title'>([^<]*)</h2>", webpage)
            if not m:
                raise ExtractorError(u'Cannot find chapter title')
            video_title = m.group(1)

            # Find our archive among all archives listed for this chapter.
            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + a.find('./start_timestamp').text
            self._downloader.report_warning(u'Chapter detected, but we do not know how to calculate start position. Downloading the whole file ... (See https://github.com/rg3/youtube-dl/issues/810 )')

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': video_title,
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API; a short page (fewer than `limit` items)
        # means we reached the end.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3421
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the <video>/<source> tags for the stream URL and the page
        header (falling back to <title>) for the title."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # was report_error without abort -> AttributeError on m.group below
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # was report_error without abort -> AttributeError on m.group below
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3459
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is verbose-style, so the default suitable() (which
        # compiles without re.VERBOSE) would not work here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract every trailer of a game as one playlist.

        The age gate is bypassed by requesting the agecheck URL with a
        pre-filled birth date.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # Movie entries, their display titles and their thumbnails appear in
        # the same order, so zip the three iterators together.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # was report_error without skipping, which still appended a
                # bogus entry with an empty URL; abort instead
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            })
        return [self.playlist_result(videos, gameID, game_title)]
3504
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded Ustream video; the stream URL is derived
        directly from the numeric video id."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            # was an unguarded m.group() -> opaque AttributeError on layout change
            raise ExtractorError(u'Unable to extract video title')
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }
        return [info]
3526
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Scrape the flash-player parameters from a WorldStarHipHop page."""
        source_re = r'so\.addVariable\("file","(.*?)"\)'

        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        source_m = re.search(source_re, page)
        if source_m is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = source_m.group(1)
        # extension is inferred from the stream URL itself
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_m = re.search(r"<title>(.*)</title>", page)
        if title_m is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_m.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_m = re.search(r'rel="image_src" href="(.*)" />', page)
        if thumb_m is not None:
            thumbnail = thumb_m.group(1)
        else:
            thumbnail = None
            candy_m = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_m is not None:
                title = candy_m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3575
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded in the page as a JSON assignment to gon.show.
        meta_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not meta_m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps variant of the stream.
        video_url = data['akamai_url'] + '&cbr=256'
        # The file extension is whatever follows the final dot in the URL path.
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3610
3611
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; a cookie bypasses it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal if absent)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (non-fatal if absent)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The fourth path component looks like "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this used to test the stale 'result' match object from the
            # download-list search above, so a missing requested format was never
            # reported and [None] leaked to the caller.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3728
3729
3730
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the id and the title are taken straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: this failure used to be misreported as
            # "unable to extract video title".
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3772
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # The title comes from the page's <title> element.
        title_m = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_m is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_m.group('title').strip()

        # The media URL only appears on the separate embed page.
        embed_m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_m.group(0).strip()
        video_id = embed_m.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The player receives the stream URL through an addVariable call.
        source_m = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_m is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_m.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3818
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment to PAGE.mix.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id is enough to drive the playback API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        track_number = 0
        # Walk the mix one track at a time until the API flags the last one.
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3862
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs are derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_m.group('title'))
        uploader_m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_m.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3886
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    Note: the _VALID_URL pattern uses re.VERBOSE, so the '#' characters
    inside it are regex comments, not Python comments.
    """
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because the pattern must be compiled with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL shape: a single talk vs a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Both patterns below scan the playlist page; video_RE is verbose-mode.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Pair up the two scans positionally; each entry is deferred to the
        # TED extractor itself via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')  # NOTE(review): currently unused
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails script block carries the numeric id and the media
        # slug that _talk_video_link turns into a download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3965
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch used to read the undefined name 'ext',
            # raising NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4021
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_m.group(1))

        # Stream variants are listed in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last listed variant, as before.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4054
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The media URL is assigned to the player's "file" option.
        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: previously fell through after report_error and crashed
            # with AttributeError on m.group(); abort like the other paths.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4103
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId query parameter as the video id,
        # falling back to the last URL path component.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max((s for s in streams if int(s['media_type']) == 0),
                     key=lambda s: int(s['quality']))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info['url'] = stream['rtmp_url']
            info['play_path'] = stream['video_url']
        else:
            assert stream['video_url'].endswith('.mp4')
            info['url'] = stream['video_url']
        return [info]
4143
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonicalize to the post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL is embedded with \x22-escaped quotes.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # BUG FIX: corrected the user-facing message, which previously
            # read "No video founded".
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4177
class BandcampIE(InfoExtractor):
    """Information extractor for free track downloads on bandcamp.com."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            self._downloader.report_error('No free songs founded')
            return
        download_link = m_download.group(1)
        # The track id lives in the TralbumData JavaScript object.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some JavaScript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        # Decompose the expired URL into server / fsig / id / ts components.
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        # (.rand is a fixed placeholder here; see the note below)
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key; with the placeholder we fall back to
        # the "retry_url" the server returns instead.
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
4223
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The direct MP4 URL sits in a <source> element.
        media_m = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">', webpage)
        if media_m is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = media_m.group(1)

        title_m = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_m is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4254
4255
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        BandcampIE(),
        # GenericIE is the catch-all fallback and must remain last.
        GenericIE()
    ]
4314
def get_info_extractor(ie_name):
    """Look up an info extractor class by its bare name (without the IE suffix)."""
    return globals()['%sIE' % ie_name]