MyVideoIE: add rtmp support
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready tracks whether _real_initialize() has already run,
    # _downloader is the FileDownloader driving this extractor, and
    # _WORKING marks known-broken extractors (see class docstring).
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Relies on the subclass defining a _VALID_URL regexp.
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: the real initialization runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Extractor name: the class name minus the trailing "IE" suffix."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses progress output entirely.
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): `sys` is not imported at the top of this file;
            # presumably it arrives via `from .utils import *` -- confirm.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset declared in the Content-Type header;
        # fall back to UTF-8 when none is declared.
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain URL string, not a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # Base64 keeps arbitrary (possibly binary) responses printable.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids raising on pages with a wrong charset declaration.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608.
    # They set the correct value of the '_type' key so the downloader knows
    # how to process the returned result.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
193
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the URL-matching regexp from the subclass' search key."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query's prefix and dispatch to _get_n_results().

        The prefix selects the result count: empty -> 1, 'all' ->
        _MAX_RESULTS, otherwise the requested number, capped at
        _MAX_RESULTS with a warning.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            # NOTE: the regexp only admits [1-9][0-9]* here, so n <= 0 is a
            # defensive check that should be unreachable in practice.
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the error message ("sublclasses" -> "subclasses").
        raise NotImplementedError("This method must be implemented by subclasses")
232
233
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp: group 1 covers everything that may precede the video ID
    # (scheme, host, path, params); group 2 is the video ID itself (this is
    # what _extract_id() reads out via mobj.group(2)).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Requesting this URL forces English text / US locale so that the
    # regexp-based page scraping sees predictable markup.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Matches the next_url parameter of age-verification redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, reordered so free (WebM) formats are preferred per tier.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed defaults to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimension string shown by --list-formats (apparently written
    # as height x width, the reverse of the usual WxH -- verify if reused).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the IE_NAME property of InfoExtractor with a fixed name.
    IE_NAME = u'youtube'
293
294     @classmethod
295     def suitable(cls, url):
296         """Receives a URL and returns True if suitable for this IE."""
297         if YoutubePlaylistIE.suitable(url): return False
298         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
299
    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to fetch the list of available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
340
341     def _get_available_subtitles(self, video_id):
342         self.report_video_subtitles_download(video_id)
343         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
344         try:
345             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
346         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
347             return (u'unable to download video subtitles: %s' % compat_str(err), None)
348         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
349         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
350         if not sub_lang_list:
351             return (u'video doesn\'t have subtitles', None)
352         return sub_lang_list
353
354     def _list_available_subtitles(self, video_id):
355         sub_lang_list = self._get_available_subtitles(video_id)
356         self.report_video_subtitles_available(video_id, sub_lang_list)
357
358     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
359         """
360         Return tuple:
361         (error_message, sub_lang, sub)
362         """
363         self.report_video_subtitles_request(video_id, sub_lang, format)
364         params = compat_urllib_parse.urlencode({
365             'lang': sub_lang,
366             'name': sub_name,
367             'v': video_id,
368             'fmt': format,
369         })
370         url = 'http://www.youtube.com/api/timedtext?' + params
371         try:
372             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
374             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
375         if not sub:
376             return (u'Did not fetch video subtitles', None, None)
377         return (None, sub_lang, sub)
378
379     def _extract_subtitle(self, video_id):
380         """
381         Return a list with a tuple:
382         [(error_message, sub_lang, sub)]
383         """
384         sub_lang_list = self._get_available_subtitles(video_id)
385         sub_format = self._downloader.params.get('subtitlesformat')
386         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
387             return [(sub_lang_list[0], None, None)]
388         if self._downloader.params.get('subtitleslang', False):
389             sub_lang = self._downloader.params.get('subtitleslang')
390         elif 'en' in sub_lang_list:
391             sub_lang = 'en'
392         else:
393             sub_lang = list(sub_lang_list.keys())[0]
394         if not sub_lang in sub_lang_list:
395             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
396
397         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
398         return [subtitle]
399
400     def _extract_all_subtitles(self, video_id):
401         sub_lang_list = self._get_available_subtitles(video_id)
402         sub_format = self._downloader.params.get('subtitlesformat')
403         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
404             return [(sub_lang_list[0], None, None)]
405         subtitles = []
406         for sub_lang in sub_lang_list:
407             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
408             subtitles.append(subtitle)
409         return subtitles
410
411     def _print_formats(self, formats):
412         print('Available formats:')
413         for x in formats:
414             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
415
    def _real_initialize(self):
        """Set the site language to English, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language: hit the English-locale URL once so later scraping
        # sees predictable page text. Failure is non-fatal but aborts here.
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page to scrape the hidden GALX/dsh form tokens.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in: replay the full Google sign-in form, including the scraped
        # tokens and the fixed fields the server expects.
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
518
519     def _extract_id(self, url):
520         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
521         if mobj is None:
522             raise ExtractorError(u'Invalid URL: %s' % url)
523         video_id = mobj.group(2)
524         return video_id
525
526     def _real_extract(self, url):
527         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
528         mobj = re.search(self._NEXT_URL_RE, url)
529         if mobj:
530             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
531         video_id = self._extract_id(url)
532
533         # Get video webpage
534         self.report_video_webpage_download(video_id)
535         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
536         request = compat_urllib_request.Request(url)
537         try:
538             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
539         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
540             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
541
542         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
543
544         # Attempt to extract SWF player URL
545         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
546         if mobj is not None:
547             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
548         else:
549             player_url = None
550
551         # Get video info
552         self.report_video_info_webpage_download(video_id)
553         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
554             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
555                     % (video_id, el_type))
556             video_info_webpage = self._download_webpage(video_info_url, video_id,
557                                     note=False,
558                                     errnote='unable to download video info webpage')
559             video_info = compat_parse_qs(video_info_webpage)
560             if 'token' in video_info:
561                 break
562         if 'token' not in video_info:
563             if 'reason' in video_info:
564                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
565             else:
566                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
567
568         # Check for "rental" videos
569         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
570             raise ExtractorError(u'"rental" videos not supported')
571
572         # Start extracting information
573         self.report_information_extraction(video_id)
574
575         # uploader
576         if 'author' not in video_info:
577             raise ExtractorError(u'Unable to extract uploader name')
578         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
579
580         # uploader_id
581         video_uploader_id = None
582         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
583         if mobj is not None:
584             video_uploader_id = mobj.group(1)
585         else:
586             self._downloader.report_warning(u'unable to extract uploader nickname')
587
588         # title
589         if 'title' not in video_info:
590             raise ExtractorError(u'Unable to extract video title')
591         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
592
593         # thumbnail image
594         if 'thumbnail_url' not in video_info:
595             self._downloader.report_warning(u'unable to extract video thumbnail')
596             video_thumbnail = ''
597         else:   # don't panic if we can't find it
598             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
599
600         # upload date
601         upload_date = None
602         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
603         if mobj is not None:
604             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
605             upload_date = unified_strdate(upload_date)
606
607         # description
608         video_description = get_element_by_id("eow-description", video_webpage)
609         if video_description:
610             video_description = clean_html(video_description)
611         else:
612             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
613             if fd_mobj:
614                 video_description = unescapeHTML(fd_mobj.group(1))
615             else:
616                 video_description = u''
617
618         # subtitles
619         video_subtitles = None
620
621         if self._downloader.params.get('writesubtitles', False):
622             video_subtitles = self._extract_subtitle(video_id)
623             if video_subtitles:
624                 (sub_error, sub_lang, sub) = video_subtitles[0]
625                 if sub_error:
626                     self._downloader.report_error(sub_error)
627
628         if self._downloader.params.get('allsubtitles', False):
629             video_subtitles = self._extract_all_subtitles(video_id)
630             for video_subtitle in video_subtitles:
631                 (sub_error, sub_lang, sub) = video_subtitle
632                 if sub_error:
633                     self._downloader.report_error(sub_error)
634
635         if self._downloader.params.get('listsubtitles', False):
636             sub_lang_list = self._list_available_subtitles(video_id)
637             return
638
639         if 'length_seconds' not in video_info:
640             self._downloader.report_warning(u'unable to extract video duration')
641             video_duration = ''
642         else:
643             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
644
645         # token
646         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
647
648         # Decide which formats to download
649         req_format = self._downloader.params.get('format', None)
650
651         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
652             self.report_rtmp_download()
653             video_url_list = [(None, video_info['conn'][0])]
654         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
655             url_map = {}
656             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
657                 url_data = compat_parse_qs(url_data_str)
658                 if 'itag' in url_data and 'url' in url_data:
659                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
660                     if not 'ratebypass' in url: url += '&ratebypass=yes'
661                     url_map[url_data['itag'][0]] = url
662
663             format_limit = self._downloader.params.get('format_limit', None)
664             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
665             if format_limit is not None and format_limit in available_formats:
666                 format_list = available_formats[available_formats.index(format_limit):]
667             else:
668                 format_list = available_formats
669             existing_formats = [x for x in format_list if x in url_map]
670             if len(existing_formats) == 0:
671                 raise ExtractorError(u'no known formats available for video')
672             if self._downloader.params.get('listformats', None):
673                 self._print_formats(existing_formats)
674                 return
675             if req_format is None or req_format == 'best':
676                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
677             elif req_format == 'worst':
678                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
679             elif req_format in ('-1', 'all'):
680                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
681             else:
682                 # Specific formats. We pick the first in a slash-delimeted sequence.
683                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
684                 req_formats = req_format.split('/')
685                 video_url_list = None
686                 for rf in req_formats:
687                     if rf in url_map:
688                         video_url_list = [(rf, url_map[rf])]
689                         break
690                 if video_url_list is None:
691                     raise ExtractorError(u'requested format not available')
692         else:
693             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
694
695         results = []
696         for format_param, video_real_url in video_url_list:
697             # Extension
698             video_extension = self._video_extensions.get(format_param, 'flv')
699
700             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
701                                               self._video_dimensions.get(format_param, '???'))
702
703             results.append({
704                 'id':       video_id,
705                 'url':      video_real_url,
706                 'uploader': video_uploader,
707                 'uploader_id': video_uploader_id,
708                 'upload_date':  upload_date,
709                 'title':    video_title,
710                 'ext':      video_extension,
711                 'format':   video_format,
712                 'thumbnail':    video_thumbnail,
713                 'description':  video_description,
714                 'player_url':   player_url,
715                 'subtitles':    video_subtitles,
716                 'duration':     video_duration
717             })
718         return results
719
720
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age so filtered videos load."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by POSTing the family-filter form
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract video info; 'yt-' prefixed ids are delegated to YoutubeIE."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: newer pages keep the media URL inside the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # The downloaded webpage is already a text (unicode) string, so no
        # .decode('utf-8') here: str.decode does not exist on Python 3 and
        # broke non-ASCII titles on Python 2.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
816
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best available stream from a Dailymotion video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the path component before any '_' suffix or query string.
        video_id = match.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so age-restricted
        # videos are reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Scan from best quality to worst and keep the first key present.
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        else:
            raise ExtractorError(u'Unable to extract video URL')

        url_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(url_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_match.group('title'))

        # Uploader: prefer the regular owner span, then the official author
        # markup; warn (but do not fail) when neither is present.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date is shown as DD-MM-YYYY on the page; convert to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
891
892
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video info, preferring the embedded JSON media descriptor."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # The webpage (and everything derived from it) is already a text
        # string; the former .decode('utf-8') calls failed on Python 3 and
        # mangled non-ASCII values on Python 2, so they are removed.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
954
955
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video info from a screen.yahoo.com URL.

        Two paths exist: pages exposing a Media CONTENT_ID go through the
        YQL JSON API; otherwise the legacy cosmos.bcst.yahoo.com mrss
        endpoints are scraped.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Check the match *before* dereferencing it: previously
            # m_rest.group() ran first, so a failed match raised
            # AttributeError instead of the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        # 'url' carries the rtmp host and 'play_path' the stream path for
        # rtmpdump-based downloading.
        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1023
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a vimeo.com URL.

        Returns a single-element list with the usual info dictionary
        (id, url, uploader, title, ext, ...).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page source
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; catching Exception keeps those propagating.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty quality bucket, best first
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1125
1126
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live stream pages end in e.g. "index-3.html"
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, wrapping failures in ExtractorError."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, search it with *regex*/*regexFlags*, and collect groups.

        matchTuples is a list of (group_index, key, error_message) tuples:
        each group is stored in the returned dict under *key*; an empty
        group raises ExtractorError with that tuple's message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp URL and play path for a live-stream page."""
        # Language code is the 4th-from-last path component of live URLs
        # (presumably 'fr' or 'de' — TODO confirm against real URLs).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned — this
        # method falls off the end and _real_extract returns None for live
        # streams, so live extraction yields no downloadable result.
        # Confirm intended behavior / the expected info-dict shape.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Arte+7 chain (page -> videoref XML -> video XML) and build an info dict."""
        # Language code is the 3rd-from-last path component for +7 URLs.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # NOTE(review): .decode('utf-8') fails under Python 3 when the
            # title is already a str — verify which type grep_webpage returns.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between live-stream and Arte+7 extraction based on the URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # NOTE(review): the live branch returns None (see extractLiveStream),
        # so only the +7 branch produces a usable result.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1246
1247
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Matches anything: this IE must be tried last.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that no specialized extractor matched (suppressed in --test runs).
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Use HEAD so only response headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Spaces in Location headers must be percent-encoded.
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # (handler order matters: the fallback and redirect handlers must be
        # registered alongside the default HTTP machinery)
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same final URL means there was no redirect to follow.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Last-resort extraction: follow redirects, then scrape common player embeds."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        # (extension from the filename, id without the extension)
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1381
1382
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # On Python 2 the query may be a byte string that needs decoding
        # for display; on Python 3 it is already text and calling decode()
        # would raise AttributeError.
        if isinstance(query, bytes):
            query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Page through the GData API, 50 results at a time, until either
        # n results were collected or the API runs out of items.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never page past the total number of hits reported by the API.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
1425
1426
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Each hit is linked from an <h3 class="r"> heading.
            playlist['entries'].extend(
                {'_type': 'url', 'url': hit.group(1)}
                for hit in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage))

            # Stop once enough results are gathered or no next page exists.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return playlist
1457
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # BUG FIX: start i at -1 so an empty results page does not leave
            # it undefined (NameError) in the stop condition below.
            i = -1
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Skip results whose markup carries no video link instead
                    # of crashing with AttributeError on mobj.group().
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when n results were seen or the API says this is the last page.
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1491
1492
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the flag is needed here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Identify the playlist
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id may come from either alternative of _VALID_URL
        playlist_id = mobj.group(1) or mobj.group(2)

        # Walk the GData API page by page, collecting (position, url) pairs
        videos = []
        page_num = 1
        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                if 'content' in entry:
                    videos.append((entry['yt$position']['$t'], entry['content']['src']))

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs
        videos = [pair[1] for pair in sorted(videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1558
1559
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from a channel page, in order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        # Identify the channel
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        video_ids = []
        video_ids.extend(self.extract_videos_from_page(page))

        # Further pages are served by the JSON-based channel_ajax endpoint
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                # The widget HTML stops advertising a load-more button on the
                # last page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1617
1618
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Identify the user
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 entries), so we
        # request consecutive pages until one comes back short - that page
        # must be the last one.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect this page's ids, keeping only the first occurrence
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)

            video_ids.extend(page_ids)

            # A short page means the feed is exhausted - no need to query
            # again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1675
1676
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist with every video posted by a blip.tv user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # BUG FIX: fail with a clear message instead of an opaque
            # AttributeError when the user page has no data-users-id.
            raise ExtractorError(u'Unable to extract user id from user page')
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUG FIX: unescape before the duplicate check; the original
                # compared the raw match against already-unescaped entries.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1735
1736
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a hosted file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # BUG FIX: use a raw string so \s is a regex class rather
                # than an invalid (deprecated) string escape sequence.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1784
1785
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Best-effort login before extraction.

        Credentials come from the --username/--password options or, failing
        that, from the user's .netrc entry for the 'facebook' machine.  Any
        failure is reported as a warning and extraction proceeds without
        authentication.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: continue anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being echoed back indicates a failed login.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as a JSON blob between these
        # two script fragments on the page.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1882
1883
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Handles three cases: /play/ redirect pages (resolved and re-run),
        direct media links (detected via the response Content-Type), and
        regular video pages queried through blip.tv's JSON API.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; resolve it and restart extraction with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Request the JSON description of the video.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # Pass the open handle through so the downloader can reuse it.
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is sometimes wrapped in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the extension from the media URL.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
1977
1978
1979 class MyVideoIE(InfoExtractor):
1980     """Information Extractor for myvideo.de."""
1981
1982     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1983     IE_NAME = u'myvideo'
1984
1985 #     Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
1986 #     Copyright (C) 2013 Tristan Fischer (sphere@dersphere.de) - GPLv3
1987     def __rc4crypt(self,data, key):
1988         x = 0
1989         box = list(range(256))
1990         for i in list(range(256)):
1991             x = (x + box[i] + ord(key[i % len(key)])) % 256
1992             box[i], box[x] = box[x], box[i]
1993         x = 0
1994         y = 0
1995         out = []
1996         for char in data:
1997             x = (x + 1) % 256
1998             y = (y + box[x]) % 256
1999             box[x], box[y] = box[y], box[x]
2000 #            out.append(chr(ord(char) ^ box[(box[x] + box[y]) % 256]))
2001             out.append(chr(char ^ box[(box[x] + box[y]) % 256]))
2002         return ''.join(out)
2003
2004     def __md5(self,s):
2005         return hashlib.md5(s).hexdigest()
2006
2007     def _real_extract(self,url):
2008         mobj = re.match(self._VALID_URL, url)
2009         if mobj is None:
2010             raise ExtractorError(u'invalid URL: %s' % url)
2011
2012         video_id = mobj.group(1)
2013
2014         GK = (
2015           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2016           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2017           b'TnpsbA0KTVRkbU1tSTRNdz09'
2018         )
2019
2020         # Get video webpage
2021         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2022         webpage = self._download_webpage(webpage_url, video_id)
2023
2024         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2025         if mobj is not None:
2026             self.report_extraction(video_id)
2027             video_url = mobj.group(1) + '.flv'
2028
2029             mobj = re.search('<title>([^<]+)</title>', webpage)
2030             if mobj is None:
2031                 raise ExtractorError(u'Unable to extract title')
2032             video_title = mobj.group(1)
2033
2034             mobj = re.search('[.](.+?)$', video_url)
2035             if mobj is None:
2036                 raise ExtractorError(u'Unable to extract extention')
2037             video_ext = mobj.group(1)
2038
2039             return [{
2040                 'id':       video_id,
2041                 'url':      video_url,
2042                 'uploader': None,
2043                 'upload_date':  None,
2044                 'title':    video_title,
2045                 'ext':      u'flv',
2046             }]
2047
2048         # try encxml
2049         params = {}
2050         encxml = ''
2051         sec = re.search('var flashvars={(.+?)}', webpage).group(1)
2052         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2053             if not a == '_encxml':
2054                 params[a] = b
2055             else:
2056                 encxml = compat_urllib_parse.unquote(b)
2057         if not params.get('domain'):
2058             params['domain'] = 'www.myvideo.de'
2059         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2060         if 'flash_playertype=MTV' in xmldata_url:
2061             self._downloader.report_warning(u'avoiding MTV player')
2062             xmldata_url = (
2063                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2064                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2065             ) % video_id
2066
2067         # get enc data
2068         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2069         enc_data_b = binascii.unhexlify(enc_data)
2070         sk = self.__md5( 
2071             base64.b64decode(base64.b64decode(GK)) + 
2072             self.__md5( 
2073                 str(video_id).encode('utf-8') 
2074             ).encode('utf-8') 
2075         )
2076         dec_data = self.__rc4crypt(enc_data_b, sk)
2077
2078         # extracting infos
2079         self.report_extraction(video_id)
2080
2081         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2082         if mobj is None:
2083             raise ExtractorError(u'unable to extract rtmpurl')
2084         video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
2085         if 'myvideo2flash' in video_rtmpurl:
2086             self._downloader.report_warning(u'forcing RTMPT ...')
2087             video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
2088
2089         # extract non rtmp videos
2090         if (video_rtmpurl is None) or (video_rtmpurl == ''):
2091             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2092             if mobj is None:
2093                 raise ExtractorError(u'unable to extract url')
2094             video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2095
2096         mobj = re.search('source=\'(.*?)\'', dec_data)
2097         if mobj is None:
2098             raise ExtractorError(u'unable to extract swfobj')
2099         video_file     = compat_urllib_parse.unquote(mobj.group(1))
2100
2101 #        mobj = re.search('path=\'(.*?)\'', dec_data)
2102 #        if mobj is None:
2103 #            raise ExtractorError(u'unable to extract filepath')
2104 #        video_filepath = mobj.group(1)
2105
2106         if not video_file.endswith('f4m'):
2107             ppath, prefix = video_file.split('.')
2108             video_playpath = '%s:%s' % (prefix, ppath)
2109             video_hls_playlist = ''
2110         else:
2111             video_playpath = ''
2112             video_hls_playlist = (
2113                 video_filepath + video_file
2114             ).replace('.f4m', '.m3u8')
2115
2116         mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
2117         if mobj is None:
2118             raise ExtractorError(u'unable to extract swfobj')
2119         video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
2120
2121         mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
2122         if mobj is None:
2123             raise ExtractorError(u'unable to extract title')
2124         video_title = mobj.group(1)
2125
2126         return [{
2127             'id':                 video_id,
2128             'url':                video_rtmpurl,
2129             'tc_url':             video_rtmpurl,
2130             'uploader':           None,
2131             'upload_date':        None,
2132             'title':              video_title,
2133             'ext':                u'flv',
2134             'play_path':          video_playpath,
2135             'video_file':         video_file,
2136 #            'file_path':          video_filepath,
2137             'video_hls_playlist': video_hls_playlist,
2138             'player_url':         video_swfobj,
2139         }]
2140
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Accepts show shortcuts (":tds", ":colbert", ...), full-episode pages
    and individual clip URLs; one episode may yield several video parts,
    so _real_extract returns a list of info dicts (one per part).
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, ordered from highest to lowest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension for each bitrate (all mp4).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution for each bitrate, used by --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE,
        # which the base class does not use when matching.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print the available format codes with extension and resolution."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all video parts of the requested episode or clip.

        Returns a list of info dicts, one per <item> in the show's MRSS
        feed; each part is downloaded from an mtvnservices media URL that
        is transformed from RTMP to a plain HTTP mirror.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Shortname forms (":tds", ":colbert", ...) are rewritten to the
        # show's full-episodes page and re-matched against _VALID_URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Clip URLs carry their title in a show-specific group.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "newest episode": the site
            # redirects the bare full-episodes page to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Follow the redirect to learn which episode is the newest.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # The player URI (mgid) is embedded either as a <param> value or a
        # JS "var url =" assignment pointing at media.mtvnservices.com.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the MRSS index that lists every part of this episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The <guid> is a colon-separated mgid; its last component is
            # the short media id, the one before it names the show.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part config XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The feeds only expose RTMP URLs; rewrite them onto the HTTP
            # mirror, keeping the path after "gsp.comedystor/".
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2307
2308
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract an Escapist video: read metadata from the page's <meta>
        tags, then fetch the player configuration to get the media URL."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = m.group('showname')
        video_id = m.group('episode')

        self.report_extraction(show_name)
        page = self._download_webpage(url, show_name)

        # All the page-level metadata lives in <meta> tags.
        description = unescapeHTML(
            re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(
            re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(
            re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL carries a url-encoded "config=..." query parameter
        # pointing at the configuration document.
        config_url = compat_urllib_parse.unquote(
            re.search('config=(.*)$', player_url).group(1))

        config_json = self._download_webpage(config_url, show_name,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # The second playlist entry holds the actual video URL.
        return [{
            'id': video_id,
            'url': config['playlist'][1]['url'],
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2362
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML for basic info, then the Adobe HDS
    (f4m) manifest, and builds the final fragment URL from the manifest's
    media node.
    """

    # Marked broken; kept for reference until the extractor is fixed.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract the video info dict for a collegehumor URL.

        Raises ExtractorError on invalid URLs, download failures, or
        metadata/manifest documents missing the expected nodes.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: metadata XML with title/description/thumbnail and
        # the manifest location.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Second request: the f4m manifest; hdcore is required by the CDN.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # note: video_id is deliberately re-bound to the manifest's
            # <id> text below, replacing the URL-derived id.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Assemble the fragment URL from the manifest host and ids.
        # NOTE(review): the "/z" prefix, "[:-2]" trim and "Seg1-Frag1"
        # suffix encode HDS URL conventions — confirm against a live
        # manifest before re-enabling this extractor.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2424
2425
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from the video page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        def find(pattern, errnote):
            # Search the page, failing loudly with the given message.
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(errnote)
            return found

        # The media URL is url-encoded inside the player's flashvars.
        video_url = compat_urllib_parse.unquote(
            find(r'flv_url=(.+?)&', u'Unable to extract video url').group(1))

        # Title precedes the " - XVID" suffix in the page <title>.
        video_title = find(r'<title>(.*?)\s+-\s+XVID',
                           u'Unable to extract video title').group(1)

        # The whole matched URL (group 0) is the thumbnail address.
        video_thumbnail = find(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            u'Unable to extract video thumbnail').group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2475
2476
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Announce that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve the pretty URL via the Soundcloud API, then fetch the
        stream definitions and return the 128k mp3 stream."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The uploader and the track slug are both part of the URL.
        uploader = m.group(1)
        slug_title = m.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Turn the human-readable URL into the API's track object.
        track_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + track_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(
            self._download_webpage(resolv_url, full_title, u'Downloading info JSON'))

        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        streams = json.loads(
            self._download_webpage(streams_url, full_title,
                                   u'Downloading stream definitions',
                                   u'unable to download stream definitions'))

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2533
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Announce that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set URL via the Soundcloud API and return one mp3
        entry per track in the set."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The uploader and the set slug are both part of the URL.
        uploader = m.group(1)
        slug_title = m.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Turn the human-readable URL into the API's set object.
        set_url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + set_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title))

        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2596
2597
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64-obfuscated media path from the page and build
        the rtmpe URL for the talk."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in a JS variable.
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(
            base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder.
        desc_match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if desc_match is not None:
            video_description = desc_match.group(1)
        else:
            video_description = u'No description available.'

        # The media file name doubles as the id; its suffix is the ext.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2644
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Queries the site's JSON API for the cloudcast, then probes the listed
    audio-format URLs until a live one is found.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry is keyed by bitrate, picks the requested
        (or highest) bitrate; otherwise returns the flat url list.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a GET; any network error moves on to
        # the next url. Returns None when none respond.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract the cloudcast info dict for a mixcloud URL.

        NOTE(review): the str.decode('utf-8') calls below are Python 2
        only — on Python 3, re groups and urls are already str and have
        no .decode(); confirm before re-enabling (_WORKING is False).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No explicit format requested: take the first format whose
            # urls actually respond.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2749
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Dispatches on how specific the URL is:
      * course + video -> a single video (metadata read from a per-video XML file)
      * course only    -> playlist of every video linked from the course page
      * site root      -> playlist of every course linked from the home page
    The playlist branches recurse via self.extract() on each discovered link.
    """

    # 'course' and 'video' are optional query parameters of the MainFolder
    # PHP pages; which named groups matched selects the branch below.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Each video has a sibling XML file describing its title and file name.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # <videoFile> holds a path relative to the course's videos/ directory.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id when no <h1> is present.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect unique video-page links, then extract each one in turn.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect unique course-page links, then extract each one in turn.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2850
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Scrapes the video page for MTV-specific <meta> tags and the player's
    playlist id, then asks the mediaGen service for an XML list of
    renditions and picks the last (highest-quality) one.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name and performer come from MTV-specific <meta> tags.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message fixed: previously read 'Unable to mtvn_uri' (missing verb).
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Previously renditions[-1] raised a bare IndexError here.
            raise ExtractorError(u'Unable to extract any rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            # find() returns None when <src> is missing -> AttributeError on .text
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2919
2920
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku obfuscates its file ids with a seeded shuffle of a fixed
    alphabet; each video is served as a sequence of separately keyed
    segments, so one info dict is returned per segment.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-unique session id: millisecond timestamp plus two random numbers."""
        millis = int(time.time() * 1000)
        rnd_a = random.randint(1000, 1998)
        rnd_b = random.randint(1000, 9999)
        return "%d%d%d" % (millis, rnd_a, rnd_b)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle Youku's fixed alphabet with their linear congruential generator, keyed on *seed*."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        while pool:
            seed = (seed * 211 + 30031) % 65536
            pos = int(math.floor(seed / 65536 * len(pool)))
            shuffled.append(pool.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated '*'-separated file id via the seeded alphabet."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(tok)] for tok in fileId.split('*') if tok)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # 'best' prefers hd2 when offered; 'worst' means mp4; any other
            # requested format falls back to flv.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the decoded fileid encode the segment number,
        # so they are replaced per segment below.
        files_info = []
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3013
3014
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        def _search(pattern, errmsg):
            # All three fields are mandatory; a miss aborts extraction.
            match = re.search(pattern, webpage)
            if match is None:
                raise ExtractorError(errmsg)
            return match.group(1)

        video_url = compat_urllib_parse.unquote(
            _search(self.VIDEO_URL_RE, u'Unable to extract video url'))
        video_title = _search(self.VIDEO_TITLE_RE, u'Unable to extract video title')
        video_thumbnail = _search(self.VIDEO_THUMB_RE, u'Unable to extract video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3058
3059
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com posts with video.

    Works in two steps: scrape the post page for metadata and the photo/video
    page URL, then scrape that page for the direct video links and pick the
    highest-resolution one.
    """

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading the post entry page."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted upload date."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction from the video page."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date (optional; stays None when not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3169
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    The CDN URL is derived directly from the page path; title, date and
    description are scraped from the page markup.
    """

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            """Return the first (HTML-unescaped) group of rexp in the page, or default."""
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Key fixed: was the misspelled 'uploader_date', which nothing
            # downstream reads; 'upload_date' is the documented field name.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3204
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a whole channel (paged archive listing), a single
    # archived broadcast (/b/<id>), or a chapter of one (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of entries the Justin.tv API returns per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page; return (total entries, info dicts for the
        entries that actually carry a video_file_url)."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API returns an object with an 'error' field
            # instead of a list of clips.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; keep just YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archive listing below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The chapter page embeds the id of the archive it belongs to.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching the id found above;
            # the for/else raises when no element matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Richer chapter metadata (title, preview, channel) comes from
            # the newer Twitch kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3337
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        url_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not url_match:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(url_match.group('url'))

        # Prefer the player-page heading; fall back to the <title> tag.
        title_match = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not title_match:
            title_match = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not title_match:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3375
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # The agecheck URL with a fixed birthday bypasses Steam's age gate.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        movie_matches = re.finditer(urlRE, webpage)
        title_matches = re.finditer(namesRE, webpage)
        thumb_matches = re.finditer(thumbsRE, webpage)

        # The three patterns appear in the same order on the page, so the
        # i-th movie, title and thumbnail belong together.
        videos = []
        for movie, name, thumb in zip(movie_matches, title_matches, thumb_matches):
            clip_id = movie.group('videoID')
            clip_url = movie.group('videoURL')
            if not clip_url:
                raise ExtractorError(u'Cannot find video url for %s' % clip_id)
            videos.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3420
class UstreamIE(InfoExtractor):
    """Information extractor for www.ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV lives at a CDN location derived directly from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3442
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r'so\.addVariable\("file","(.*?)"\)'

        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_src = self._download_webpage(url, video_id)

        # The direct file URL is set through the Flash player's 'file' variable.
        src_match = re.search(_src_url, webpage_src)
        if src_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)
            thumbnail = None

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3491
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as JSON assigned to gon.show.
        meta_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not meta_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request a 256 kbps stream from the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3526
3527
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' field equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The site shows an age gate; pre-setting this cookie skips it
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional: warn, don't fail)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional: warn, don't fail)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The path component "480p_370k_..." encodes resolution and bitrate
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # assumes the download list is ordered best-first -- TODO confirm
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: previously this tested the stale `result` match object
            # (never None here), so a missing format returned [None] and
            # crashed downstream instead of raising a clear error
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3642
3643
3644
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # id and title both come straight from the URL
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: the message previously said 'Unable to extract video
            # title', which was misleading for a date-extraction failure
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3683
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # Title comes from the page <title> tag
        title_mobj = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_mobj is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_mobj.group('title').strip()

        # Locate the embed page, which hosts the actual player setup
        embed_mobj = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_mobj is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_mobj.group(0).strip()
        video_id = embed_mobj.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the stream URL via an addVariable call
        source_mobj = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_mobj is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_mobj.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3728
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON literal
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id keys the play/next API sequence
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        # Walk the play/next API until it reports the last track
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3772
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs follow a fixed CDN pattern
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)

        return [{
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': unescapeHTML(title_match.group('title')),
                'thumbnail': thumbnail,
                'uploader': clean_html(uploader_match.group('uploader'))
        }]
3796
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # NOTE: the '#' characters below are part of the regex (re.VERBOSE mode),
    # not Python comments
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): overridden here, presumably because _VALID_URL only
        # matches with re.VERBOSE -- confirm against the base implementation
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch to the talk or playlist handler depending on which URL
        # alternative matched
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk appears as an <li id="talk_..."> element with data-*
        # attributes; titles and talk URLs come from the talk-title paragraphs
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Delegate each talk back to this extractor via url_result
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and the mediaSlug
        # used to build the direct download URL
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3875
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: this branch previously read `format = ext`, but no name
            # 'ext' exists in this scope, so a missing <format_id> raised
            # NameError; fall back to the file extension instead
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3929
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document lists the available media files
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # use the last media entry, mirroring the original selection logic
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3962
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        # The player config carries the direct file URL
        url_match = re.search(r'file: "(.*?)",', webpage)
        if not url_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        # Strip the site prefix from the Open Graph title
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if desc_match:
            desc = unescapeHTML(desc_match.group('desc'))
        else:
            desc = None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        if uploader_match:
            uploader = clean_html(uploader_match.group(1))
        else:
            uploader = None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
4009
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek
    (ardmediathek.de / mediathek.daserste.de)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # page heading used as the video title
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # addMediaStream(media_type, quality, rtmp_url, video_url, ...) calls in
    # the page's JavaScript describe the available streams
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # prefer the numeric documentId query parameter when present
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # no streams usually means an age-restricted ("fsk") video that is
            # only published in the evening
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            # url is the RTMP endpoint; play_path is the stream name,
            # both consumed by rtmpdump
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4048
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video markup is embedded in escaped form (\x22 is '"')
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # BUGFIX: message previously read "No video founded"
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        # BUGFIX: guard the search result; posts without posters previously
        # crashed with AttributeError on .group()
        m_thumb = re.search(re_thumb, webpage)
        if m_thumb is not None:
            thumb = m_thumb.group('thumb').replace('\\', '')
        else:
            thumb = None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4082
class BandcampIE(InfoExtractor):
    """Information extractor for bandcamp.com tracks (free downloads only)."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # BUGFIX: message previously read "No free songs founded"
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # BUGFIX: renamed local 'id' -> 'video_id'; the old name shadowed the
        # builtin id()
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
4128
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The direct mp4 URL sits in an HTML5 <source> element
        url_mobj = re.search(r'<source src="(.+)" type="video/mp4">',webpage)
        if url_mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = url_mobj.group(1)

        title_mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
        if title_mobj is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    title_mobj.group(1),
        }]
4159         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # The player's MRSS feed carries the direct mp4 link
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        url_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      url_match.group(1),
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4188
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # One class per supported site, in matching-priority order
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ie_classes]
4248
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention, so the
    # class can be looked up directly in the module namespace
    class_name = '%sIE' % ie_name
    return globals()[class_name]